
Commit 055051f

0.1.0 (#26)
* create v0.1.0
* simplifying hamstr.pl
* first python conversion
* modified addTaxon and addTaxa
* added install/check dependencies
* rename data folders
* added runtime for core compilation
1 parent d8eea1f commit 055051f

39 files changed: +2940 lines, -8134 lines

.DS_Store

Binary file (-6 KB) not shown.

.github/workflows/github_build.yml

Lines changed: 14 additions & 5 deletions
@@ -5,9 +5,11 @@ name: build
 
 on:
   push:
-    branches: [ master ]
+    branches:
+      - master
+      - 0.1.0
     tags:
-    - '*'
+      - '*'
   pull_request:
     branches: [ master ]

@@ -38,10 +40,17 @@ jobs:
         run: |
           pwd
           pip install .
-          fdog.setup -o /home/runner/work/fDOG/fDOG/dt --lib
-          fdog.setup -o /home/runner/work/fDOG/fDOG/dt
+          path=$(fdog.setup -d ./ --getSourcepath); for i in $(less $path/data/dependencies.txt); do sudo apt-get install -y -qq $i; done
+          fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS
+          fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast
           fdog.showTaxa
-          fdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3 --fasoff
+          fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff
+          mkdir seeds
+          path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done
+          fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3
+          head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa
+          fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a
+          ls
       - name: Deploy
         if: startsWith(github.event.ref, 'refs/tags')
         uses: casperdcl/deploy-pypi@v2
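
Note: the new test block above exercises the full fdog 0.1.0 tool chain: install the system dependencies listed in the packaged dependencies.txt, build and validate the core data set, run a single-seed and a multi-seed search, and add a new taxon. A minimal local sketch of the same steps, using only the commands shown in the workflow, with ./dt substituted for the runner path and cat used in place of less:

    # assumes fdog 0.1.0 is installed from this checkout (pip install .)
    path=$(fdog.setup -d ./ --getSourcepath)
    for i in $(cat $path/data/dependencies.txt); do sudo apt-get install -y -qq $i; done
    # compile the core data set without FAS annotations, then validate it
    fdog.setup -d ./dt --woFAS
    fdog.checkData -s ./dt/searchTaxa_dir -c ./dt/coreTaxa_dir -a ./dt/annotation_dir --reblast
    # single-seed test run against the HUMAN reference proteome, FAS scoring off
    fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff
    # multi-seed run: the seeds folder holds one FASTA file per seed (the CI step uses copies of infile.fa)
    mkdir seeds && for i in 1 2 3; do cp $path/data/infile.fa seeds/$i.fa; done
    fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3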

fdog/.DS_Store

Binary file (-8 KB) not shown.

fdog/addTaxa.py

Lines changed: 89 additions & 115 deletions
@@ -1,17 +1,13 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
 # This script is used to prepare data for fdog.
-# For each given genome FASTA file, It will create a folder within genome_dir
+# For each given genome FASTA file, It will create a folder within searchTaxa_dir
 # with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version]
-# e.g HUMAN@9606@3), a annotation file in JSON format in weight_dir and
-# a blast DB in blast_dir folder (optional).
-# For a long header of original FASTA sequence, only the first word
-# will be taken as the ID of new fasta file, everything after the
-# first whitespace will be removed. If this first word is not unique,
-# an automatically increasing index will be added.
+# e.g HUMAN@9606@3), a annotation file in JSON format in annotation_dir and
+# a blast DB in coreTaxa_dir folder (optional).
 #
 # This script is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -26,98 +22,74 @@
 import sys
 import os
 import argparse
-from os import listdir
-from os.path import isfile, join
 from pathlib import Path
-import subprocess
+from Bio import SeqIO
 import multiprocessing as mp
+from tqdm import tqdm
 from ete3 import NCBITaxa
-import csv
-from io import StringIO
 import re
 import shutil
-from tqdm import tqdm
 from datetime import datetime
+import time
+from pkg_resources import get_distribution
+from collections import OrderedDict
 
-def checkFileExist(file):
-    if not os.path.exists(os.path.abspath(file)):
-        sys.exit('%s not found' % file)
+import fdog.libs.zzz as general_fn
+import fdog.libs.tree as tree_fn
+import fdog.libs.addtaxon as add_taxon_fn
 
-def getTaxName(taxId):
-    ncbi = NCBITaxa()
-    try:
-        ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)]
-        ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName)
-        taxName = ncbiName.split()
-        name = taxName[0][:3].upper()+taxName[1][:2].upper()
-    except:
-        name = "UNK" + taxId
-    return(name)
 
-def parseMapFile(mappingFile):
-    nameDict = {}
-    with open(mappingFile) as f:
+def parse_map_file(mapping_file, folIn):
+    """ Create spec name from mapping file
+    And also check if given input files in mapping file exist
+    """
+    name_dict = {}
+    with open(mapping_file) as f:
         for line in f:
             if not '#' in line:
                 tmp = line.split('\t')
-                fileName = tmp[0]
-                taxId = tmp[1].strip()
+                file_name = tmp[0]
+                file_in = '%s/%s' % (folIn, file_name)
+                general_fn.check_file_exist(file_in)
+                tax_id = tmp[1].strip()
                 try:
-                    taxName = tmp[2].strip()
+                    tax_name = tmp[2].strip()
                 except:
-                    taxName = getTaxName(taxId)
+                    tax_name = ''
                 try:
                     ver = tmp[3].strip()
                 except:
-                    ver = datetime.today().strftime('%y%m%d') #1
-                # print(taxName+"@"+str(taxId)+"@"+str(ver))
-                nameDict[fileName] = (taxName, str(taxId), str(ver))
-    return(nameDict)
+                    ver = datetime.today().strftime('%y%m%d')
+                spec_name = add_taxon_fn.generate_spec_name(tax_id, tax_name, ver)
+                name_dict[file_in] = spec_name
+    return(name_dict)
 
-def runAddTaxon(args):
-    (f,n,i,o,c,v,a,cpus,replace,delete) = args
-    cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus)
-    if c == True:
-        cmd = cmd + ' -c'
-    if a == True:
-        cmd = cmd + ' -a'
-    if replace == True:
-        cmd = cmd + ' --replace'
-    if delete == True:
-        cmd = cmd + ' --delete'
-    # print(cmd)
-    logFile = o + '/addTaxa2fDog.log'
-    cmd = cmd + ' >> ' + logFile
-    try:
-        subprocess.call([cmd], shell = True)
-    except:
-        sys.exit('Problem running\n%s' % (cmd))
 
 def main():
-    version = '0.0.9'
-    parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
     required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True)
     required.add_argument('-m', '--mapping',
-        help='Tab-delimited text file containing <fasta_filename>tab<taxonID>tab<taxonName>tab<genome_version>. The last 2 columns are optional.',
+        help='Tab-delimited text file containing <fasta_file_name>tab<taxonID>tab<taxonName>tab<genome_version>. The last 2 columns are optional.',
         action='store', default='', required=True)
     optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
-    optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
-    optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
+    optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in coreTaxa_dir folder)', action='store_true', default=False)
+    optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False)
     optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
     optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
     optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
-    optional.add_argument('-f', '--force', help='Force overwrite existing data', action='store_true', default=False)
+    optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False)
 
-    ### get arguments
     args = parser.parse_args()
     folIn = args.input
+    folIn = os.path.abspath(folIn)
     mapping = args.mapping
-    checkFileExist(mapping)
+    general_fn.check_file_exist(mapping)
     outPath = args.outPath
     if outPath == '':
-        fdogPath = os.path.realpath(__file__).replace('/addTaxa.py','')
+        fdogPath = os.path.realpath(__file__).replace('/addTaxon.py','')
         pathconfigFile = fdogPath + '/bin/pathconfig.txt'
         if not os.path.exists(pathconfigFile):
             sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
@@ -131,61 +103,63 @@ def main():
     cpus = mp.cpu_count()-2
     replace = args.replace
     delete = args.delete
+    add_taxon_fn.check_conflict_opts(replace, delete)
     force = args.force
 
+    start = time.time()
+    ### parse mapping file
+    name_dict = parse_map_file(mapping, folIn)
 
-    ### get existing genomes
-    Path(outPath + "/genome_dir").mkdir(parents = True, exist_ok = True)
-    Path(outPath + "/weight_dir").mkdir(parents = True, exist_ok = True)
-    genomeFiles = listdir(outPath + "/genome_dir")
-
-    ### generate taxon names from mapping file
-    nameDict = parseMapFile(mapping)
-
-    ### read all input fasta files and create addTaxon jobs
-    jobs = []
-    dupList = {}
-    faFiles = [f for f in listdir(folIn) if isfile(join(folIn, f))]
-    for f in faFiles:
-        # tmp = f.split('.')
-        if f in nameDict:
-            # check duplicated taxon name in existing data
-            taxName = '@'.join(nameDict[f])
-            flag = 1
-            if taxName in genomeFiles:
-                if force:
-                    shutil.rmtree(outPath + "/genome_dir/" + taxName)
-                    if not noAnno:
-                        shutil.rmtree(outPath + "/weight_dir/" + taxName)
-                else:
-                    flag = 0
-                    dupList[f] = taxName
+    ### initiate paths
+    Path(outPath + '/searchTaxa_dir').mkdir(parents = True, exist_ok = True)
 
-            if flag == 1:
-                fasta = folIn + '/' + f
-                name = nameDict[f][0]
-                taxid = nameDict[f][1]
-                verProt = nameDict[f][2]
-                jobs.append([
-                    folIn + '/' + f, nameDict[f][0], nameDict[f][1],
-                    outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete
-                ])
-
-    if len(dupList) > 0:
-        print("These taxa are probably already present in %s:" % (outPath + "/genome_dir"))
-        for f in dupList:
-            print('\t'+f+'\t'+dupList[f])
+    ### create file in searchTaxa_dir [and coreTaxa_dir]
+    genome_jobs = []
+    blast_jobs = []
+    for f in name_dict:
+        spec_name = name_dict[f]
+        ## remove old folder if force is set
         if force:
-            print('They will be deleted and re-compiled!')
-        else:
-            sys.exit("Please remove them from the mapping file or use different Name/ID/Version!")
-
-    print('Parsing...')
-    for job in tqdm(jobs):
-        # print('@'.join([job[1],job[2],job[5]]) + '\t' + job[0])
-        runAddTaxon(job)
-
-    print('Output can be found in %s' % outPath)
+            if os.path.exists(outPath + '/searchTaxa_dir/' + spec_name):
+                shutil.rmtree(outPath + '/searchTaxa_dir/' + spec_name)
+            if os.path.exists(outPath + '/coreTaxa_dir/' + spec_name):
+                shutil.rmtree(outPath + '/coreTaxa_dir/' + spec_name)
+        ## create jobs
+        genome_path = '%s/searchTaxa_dir/%s' % (outPath, spec_name)
+        Path(genome_path).mkdir(parents = True, exist_ok = True)
+        genome_jobs.append([f, genome_path, spec_name, force, replace, delete])
+        if coreTaxa:
+            genome_file = '%s/%s.fa' % (genome_path, spec_name)
+            blast_jobs.append([outPath, spec_name, genome_file, force, True])
+    pool = mp.Pool(cpus)
+
+    print('Parsing genome for %s species...' % len(genome_jobs))
+    genome_out = []
+    for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_genome, genome_jobs),
+            total=len(genome_jobs)):
+        genome_out.append(_)
+    out_msg = 'Output for %s can be found in %s within searchTaxa_dir' % (spec_name, outPath)
+    if len(blast_jobs) > 0:
+        print('\nCreating Blast DB for %s species...' % len(blast_jobs))
+        blast_out = []
+        for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_blastdb, blast_jobs),
+                total=len(blast_jobs)):
+            blast_out.append(_)
+        out_msg = '%s, coreTaxa_dir' % out_msg
+
+    ### create annotation
+    if not noAnno:
+        Path(outPath + '/annotation_dir').mkdir(parents = True, exist_ok = True)
+        for f in name_dict:
+            genome_file = '%s/searchTaxa_dir/%s/%s.fa' % (outPath, name_dict[f], name_dict[f])
+            add_taxon_fn.create_annoFile(outPath, genome_file, cpus, force)
+        if os.path.exists('%s/annotation_dir/tmp' % outPath):
+            shutil.rmtree('%s/annotation_dir/tmp' % outPath)
+        out_msg = '%s, annotation_dir' % out_msg
+
+    end = time.time()
+    print('==> Adding %s taxa finished in %s' % (len(name_dict), '{:5.3f}s'.format(end - start)))
+    print('==> %s' % out_msg)
 
 if __name__ == '__main__':
     main()
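
Note: the reworked script calls fdog.libs.addtaxon in a multiprocessing pool instead of shelling out to fdog.addTaxon once per file. As a usage sketch, and assuming the console entry point for this module is fdog.addTaxa (the file and folder names below are hypothetical; the mapping format and output layout follow the argparse help and the script header above):

    # mapping file: <fasta_file_name> TAB <taxonID> [TAB <taxonName> [TAB <genome_version>]]
    printf 'human.fa\t9606\tHUMAN\t3\nyeast.fa\t559292\n' > mapping.txt
    # add both proteomes from ./proteomes to ./data, include them as core taxa (-c), skip FAS annotation (-a)
    fdog.addTaxa -i ./proteomes -m mapping.txt -o ./data -c -a
    # the first row should yield ./data/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa plus a BLAST DB in coreTaxa_dir;
    # rows without name/version get a name resolved by fdog and today's date (yymmdd) as version (see parse_map_file)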
