Skip to content

Commit e5b283f

Browse files
authored
Merge pull request #15 from BIONF/dev
Upgrade NCBI Datasets API to v2 & Refactor Taxonomy Check
2 parents 66cb51c + 9ca2a5b commit e5b283f

13 files changed

+26
-145
lines changed

pyproject.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "XspecT"
3-
version = "0.2.6"
3+
version = "0.2.7"
44
description = "Tool to monitor and characterize pathogens using Bloom filters."
55
readme = {file = "README.md", content-type = "text/markdown"}
66
license = {file = "LICENSE"}

src/xspect/mlst_feature/mlst_helper.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Module for utility functions used in other modules regarding MLST. """
1+
"""Module for utility functions used in other modules regarding MLST."""
22

33
__author__ = "Cetin, Oemer"
44

src/xspect/models/result.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Module for storing the results of XspecT models. """
1+
"""Module for storing the results of XspecT models."""
22

33
from enum import Enum
44

src/xspect/pipeline.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Module for defining the Pipeline class. """
1+
"""Module for defining the Pipeline class."""
22

33
import json
44
from pathlib import Path

src/xspect/run.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Module with XspecT global run class, which summarizes individual model results. """
1+
"""Module with XspecT global run class, which summarizes individual model results."""
22

33
import json
44
from pathlib import Path

src/xspect/train.py

+1-6
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
)
2323
from xspect.train_filter import (
2424
create_svm,
25-
html_scrap,
2625
extract_and_concatenate,
2726
)
2827

@@ -136,14 +135,10 @@ def train_ncbi(genus: str, svm_step: int = 1):
136135
children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
137136
species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(children_ids).get_metadata()
138137

139-
# Get all gcf accessions that have Taxonomy check result OK.
140-
logger.info("Checking ANI data for updates")
141-
ani_gcf = html_scrap.TaxonomyCheck().ani_gcf()
142-
143138
# Look for up to 8 assembly accessions per species.
144139
logger.info("Getting assembly metadata")
145140
all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
146-
all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
141+
all_metadata=species_dict, count=8, contig_n50=10000
147142
)
148143
all_metadata = all_metadata.get_all_metadata()
149144

src/xspect/train_filter/extract_and_concatenate.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Module for extracting and concatenating assemblies. """
1+
"""Module for extracting and concatenating assemblies."""
22

33
__author__ = "Berger, Phillip"
44

src/xspect/train_filter/html_scrap.py

-114
This file was deleted.

src/xspect/train_filter/ncbi_api/download_assemblies.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,9 @@ def download_assemblies(accessions, dir_name, target_folder, zip_file_name):
2323
"""
2424

2525
path = get_xspect_tmp_path() / dir_name / target_folder / zip_file_name
26-
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/{','.join(accessions)}/download"
26+
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{','.join(accessions)}/download"
2727
parameters = {"include_annotation_type": "GENOME_FASTA", "filename": zip_file_name}
2828
os.makedirs(os.path.dirname(path), exist_ok=True)
29-
genome_download = requests.get(api_url, params=parameters, timeout=20)
29+
genome_download = requests.get(api_url, params=parameters, timeout=30)
3030
with open(path, "wb") as f:
3131
f.write(genome_download.content)

src/xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py

+13-13
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Collects metadata of assemblies from NCBI API """
1+
"""Collects metadata of assemblies from NCBI API"""
22

33
__author__ = "Berger, Phillip"
44

@@ -14,16 +14,14 @@ class NCBIAssemblyMetadata:
1414

1515
_all_metadata: dict
1616
_count: int
17-
_ani_gcf: list
1817
_parameters: dict
1918
_accessions: list[str]
2019
_contig_n50: int
2120
_all_metadata_complete: dict
2221

23-
def __init__(self, all_metadata: dict, ani_gcf: list, count=8, contig_n50=10000):
22+
def __init__(self, all_metadata: dict, count=8, contig_n50=10000):
2423
self._all_metadata = all_metadata
2524
self._count = count
26-
self._ani_gcf = ani_gcf
2725
self._contig_n50 = contig_n50
2826

2927
self._set_parameters()
@@ -72,25 +70,27 @@ def _set_parameters(self):
7270
}
7371

7472
def _make_request(self, taxon: str):
75-
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/taxon/{taxon}"
73+
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
7674
accessions = []
7775
count = 0
7876
for request_type, parameters in self._parameters.items():
7977
raw_response = requests.get(api_url, params=parameters, timeout=5)
8078
response = raw_response.json()
8179
if response:
8280
try:
83-
assemblies = response["assemblies"]
84-
for assembly in assemblies:
85-
curr_assembly = assembly["assembly"]
86-
curr_accession = curr_assembly["assembly_accession"]
87-
curr_contig_n50 = curr_assembly["contig_n50"]
81+
reports = response["reports"]
82+
for report in reports:
83+
accession = report["accession"]
84+
contig_n50 = report["assembly_stats"]["contig_n50"]
85+
taxonomy_check_status = report["average_nucleotide_identity"][
86+
"taxonomy_check_status"
87+
]
8888
if count < self._count:
8989
if (
90-
curr_accession in self._ani_gcf
91-
and curr_contig_n50 > self._contig_n50
90+
taxonomy_check_status == "OK"
91+
and contig_n50 > self._contig_n50
9292
):
93-
accessions.append(curr_accession)
93+
accessions.append(accession)
9494
count += 1
9595
else:
9696
break

src/xspect/train_filter/ncbi_api/ncbi_children_tree.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ def __init__(self, taxon: str):
2424

2525
def _request_tree(self):
2626
"""Make the request for the children tree at the NCBI Datasets API."""
27-
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{self._taxon}/filtered_subtree"
27+
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{self._taxon}/filtered_subtree"
2828
raw_response = requests.get(api_url, timeout=5)
2929
self._response = raw_response.json()["edges"]
3030
self._parent_taxon_id = str(self._response["1"]["visible_children"][0])

src/xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" This module is used to retrieve metadata from the NCBI taxonomy database. """
1+
"""This module is used to retrieve metadata from the NCBI taxonomy database."""
22

33
__author__ = "Berger, Phillip"
44

@@ -21,7 +21,7 @@ def __init__(self, taxon: list[str]):
2121
self._collect_all_metadata()
2222

2323
def _request_metadata(self):
24-
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{str(self._taxon)}"
24+
api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{str(self._taxon)}"
2525
raw_response = requests.get(api_url, timeout=5)
2626
self._response = raw_response.json()["taxonomy_nodes"]
2727

tests/test_probabilistic_filter_svm_model.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
""" Tests for the ProbabilisticFilterSVMModel class. """
1+
"""Tests for the ProbabilisticFilterSVMModel class."""
22

33
# pylint: disable=redefined-outer-name
44

0 commit comments

Comments
 (0)