Merge pull request #15 from BIONF/dev

aromberg · web-flow · commit e5b283fa13f5 · 2025-03-14T15:10:27.000+01:00
Upgrade NCBI Datasets API to v2 & Refactor Taxonomy Check
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "XspecT"
-version = "0.2.6"
+version = "0.2.7"
 description = "Tool to monitor and characterize pathogens using Bloom filters."
 readme = {file = "README.md", content-type = "text/markdown"}
 license = {file = "LICENSE"}
diff --git a/src/xspect/mlst_feature/mlst_helper.py b/src/xspect/mlst_feature/mlst_helper.py
@@ -1,4 +1,4 @@
-""" Module for utility functions used in other modules regarding MLST. """
+"""Module for utility functions used in other modules regarding MLST."""
 
 __author__ = "Cetin, Oemer"
 
diff --git a/src/xspect/models/result.py b/src/xspect/models/result.py
@@ -1,4 +1,4 @@
-""" Module for storing the results of XspecT models. """
+"""Module for storing the results of XspecT models."""
 
 from enum import Enum
 
diff --git a/src/xspect/pipeline.py b/src/xspect/pipeline.py
@@ -1,4 +1,4 @@
-""" Module for defining the Pipeline class. """
+"""Module for defining the Pipeline class."""
 
 import json
 from pathlib import Path
diff --git a/src/xspect/run.py b/src/xspect/run.py
@@ -1,4 +1,4 @@
-""" Module with XspecT global run class, which summarizes individual model results. """
+"""Module with XspecT global run class, which summarizes individual model results."""
 
 import json
 from pathlib import Path
diff --git a/src/xspect/train.py b/src/xspect/train.py
@@ -22,7 +22,6 @@
 )
 from xspect.train_filter import (
     create_svm,
-    html_scrap,
     extract_and_concatenate,
 )
 
@@ -136,14 +135,10 @@ def train_ncbi(genus: str, svm_step: int = 1):
     children_ids = ncbi_children_tree.NCBIChildrenTree(genus).children_ids()
     species_dict = ncbi_taxon_metadata.NCBITaxonMetadata(children_ids).get_metadata()
 
-    # Get all gcf accessions that have Taxonomy check result OK.
-    logger.info("Checking ANI data for updates")
-    ani_gcf = html_scrap.TaxonomyCheck().ani_gcf()
-
     # Look for up to 8 assembly accessions per species.
     logger.info("Getting assembly metadata")
     all_metadata = ncbi_assembly_metadata.NCBIAssemblyMetadata(
-        all_metadata=species_dict, ani_gcf=ani_gcf, count=8, contig_n50=10000
+        all_metadata=species_dict, count=8, contig_n50=10000
     )
     all_metadata = all_metadata.get_all_metadata()
 
diff --git a/src/xspect/train_filter/extract_and_concatenate.py b/src/xspect/train_filter/extract_and_concatenate.py
@@ -1,4 +1,4 @@
-""" Module for extracting and concatenating assemblies. """
+"""Module for extracting and concatenating assemblies."""
 
 __author__ = "Berger, Phillip"
 
diff --git a/src/xspect/train_filter/html_scrap.py b/src/xspect/train_filter/html_scrap.py
diff --git a/src/xspect/train_filter/ncbi_api/download_assemblies.py b/src/xspect/train_filter/ncbi_api/download_assemblies.py
@@ -23,9 +23,9 @@ def download_assemblies(accessions, dir_name, target_folder, zip_file_name):
     """
 
     path = get_xspect_tmp_path() / dir_name / target_folder / zip_file_name
-    api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/accession/{','.join(accessions)}/download"
+    api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{','.join(accessions)}/download"
     parameters = {"include_annotation_type": "GENOME_FASTA", "filename": zip_file_name}
     os.makedirs(os.path.dirname(path), exist_ok=True)
-    genome_download = requests.get(api_url, params=parameters, timeout=20)
+    genome_download = requests.get(api_url, params=parameters, timeout=30)
     with open(path, "wb") as f:
         f.write(genome_download.content)
diff --git a/src/xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py b/src/xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py
@@ -1,4 +1,4 @@
-""" Collects metadata of assemblies from NCBI API """
+"""Collects metadata of assemblies from NCBI API"""
 
 __author__ = "Berger, Phillip"
 
@@ -14,16 +14,14 @@ class NCBIAssemblyMetadata:
 
     _all_metadata: dict
     _count: int
-    _ani_gcf: list
     _parameters: dict
     _accessions: list[str]
     _contig_n50: int
     _all_metadata_complete: dict
 
-    def __init__(self, all_metadata: dict, ani_gcf: list, count=8, contig_n50=10000):
+    def __init__(self, all_metadata: dict, count=8, contig_n50=10000):
         self._all_metadata = all_metadata
         self._count = count
-        self._ani_gcf = ani_gcf
         self._contig_n50 = contig_n50
 
         self._set_parameters()
@@ -72,25 +70,27 @@ def _set_parameters(self):
         }
 
     def _make_request(self, taxon: str):
-        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/taxon/{taxon}"
+        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon}/dataset_report"
         accessions = []
         count = 0
         for request_type, parameters in self._parameters.items():
             raw_response = requests.get(api_url, params=parameters, timeout=5)
             response = raw_response.json()
             if response:
                 try:
-                    assemblies = response["assemblies"]
-                    for assembly in assemblies:
-                        curr_assembly = assembly["assembly"]
-                        curr_accession = curr_assembly["assembly_accession"]
-                        curr_contig_n50 = curr_assembly["contig_n50"]
+                    reports = response["reports"]
+                    for report in reports:
+                        accession = report["accession"]
+                        contig_n50 = report["assembly_stats"]["contig_n50"]
+                        taxonomy_check_status = report["average_nucleotide_identity"][
+                            "taxonomy_check_status"
+                        ]
                         if count < self._count:
                             if (
-                                curr_accession in self._ani_gcf
-                                and curr_contig_n50 > self._contig_n50
+                                taxonomy_check_status == "OK"
+                                and contig_n50 > self._contig_n50
                             ):
-                                accessions.append(curr_accession)
+                                accessions.append(accession)
                                 count += 1
                         else:
                             break
diff --git a/src/xspect/train_filter/ncbi_api/ncbi_children_tree.py b/src/xspect/train_filter/ncbi_api/ncbi_children_tree.py
@@ -24,7 +24,7 @@ def __init__(self, taxon: str):
 
     def _request_tree(self):
         """Make the request for the children tree at the NCBI Datasets API."""
-        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{self._taxon}/filtered_subtree"
+        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{self._taxon}/filtered_subtree"
         raw_response = requests.get(api_url, timeout=5)
         self._response = raw_response.json()["edges"]
         self._parent_taxon_id = str(self._response["1"]["visible_children"][0])
diff --git a/src/xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py b/src/xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py
@@ -1,4 +1,4 @@
-""" This module is used to retrieve metadata from the NCBI taxonomy database. """
+"""This module is used to retrieve metadata from the NCBI taxonomy database."""
 
 __author__ = "Berger, Phillip"
 
@@ -21,7 +21,7 @@ def __init__(self, taxon: list[str]):
         self._collect_all_metadata()
 
     def _request_metadata(self):
-        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{str(self._taxon)}"
+        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{str(self._taxon)}"
         raw_response = requests.get(api_url, timeout=5)
         self._response = raw_response.json()["taxonomy_nodes"]
 
diff --git a/tests/test_probabilistic_filter_svm_model.py b/tests/test_probabilistic_filter_svm_model.py
@@ -1,4 +1,4 @@
-""" Tests for the ProbabilisticFilterSVMModel class. """
+"""Tests for the ProbabilisticFilterSVMModel class."""
 
 # pylint: disable=redefined-outer-name
 

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Module for utility functions used in other modules regarding MLST. """`
	`1`	`+"""Module for utility functions used in other modules regarding MLST."""`
`2`	`2`
`3`	`3`	`__author__ = "Cetin, Oemer"`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Module for storing the results of XspecT models. """`
	`1`	`+"""Module for storing the results of XspecT models."""`
`2`	`2`
`3`	`3`	`from enum import Enum`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Module for defining the Pipeline class. """`
	`1`	`+"""Module for defining the Pipeline class."""`
`2`	`2`
`3`	`3`	`import json`
`4`	`4`	`from pathlib import Path`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Module with XspecT global run class, which summarizes individual model results. """`
	`1`	`+"""Module with XspecT global run class, which summarizes individual model results."""`
`2`	`2`
`3`	`3`	`import json`
`4`	`4`	`from pathlib import Path`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Module for extracting and concatenating assemblies. """`
	`1`	`+"""Module for extracting and concatenating assemblies."""`
`2`	`2`
`3`	`3`	`__author__ = "Berger, Phillip"`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Tests for the ProbabilisticFilterSVMModel class. """`
	`1`	`+"""Tests for the ProbabilisticFilterSVMModel class."""`
`2`	`2`
`3`	`3`	`# pylint: disable=redefined-outer-name`
`4`	`4`