nf-core
diff --git a/CITATIONS.md
Lines changed: 7 additions & 17 deletions b/CITATIONS.md
Lines changed: 7 additions & 17 deletions
diff --git a/bin/clustal2fasta.py
Lines changed: 0 additions & 31 deletions b/bin/clustal2fasta.py
Lines changed: 0 additions & 31 deletions
diff --git a/bin/clustal2phylip.py
Lines changed: 0 additions & 31 deletions b/bin/clustal2phylip.py
Lines changed: 0 additions & 31 deletions
diff --git a/bin/csv_adorn.py
Lines changed: 4 additions & 3 deletions b/bin/csv_adorn.py
Lines changed: 4 additions & 3 deletions
diff --git a/bin/ensembl2uniprot.py
Lines changed: 3 additions & 3 deletions b/bin/ensembl2uniprot.py
Lines changed: 3 additions & 3 deletions
diff --git a/bin/fetch_afdb_structures.py
Lines changed: 0 additions & 58 deletions b/bin/fetch_afdb_structures.py
Lines changed: 0 additions & 58 deletions
diff --git a/bin/fetch_ensembl_idmap.py
Lines changed: 2 additions & 0 deletions b/bin/fetch_ensembl_idmap.py
Lines changed: 2 additions & 0 deletions
diff --git a/bin/fetch_ensembl_sequences.py
Lines changed: 4 additions & 0 deletions b/bin/fetch_ensembl_sequences.py
Lines changed: 4 additions & 0 deletions
diff --git a/bin/fetch_inspector_group.py
Lines changed: 3 additions & 3 deletions b/bin/fetch_inspector_group.py
Lines changed: 3 additions & 3 deletions
diff --git a/bin/fetch_oma_by_sequence.py
Lines changed: 2 additions & 0 deletions b/bin/fetch_oma_by_sequence.py
Lines changed: 2 additions & 0 deletions
diff --git a/bin/fetch_oma_group.py
Lines changed: 2 additions & 3 deletions b/bin/fetch_oma_group.py
Lines changed: 2 additions & 3 deletions
diff --git a/bin/fetch_oma_groupid.py
Lines changed: 2 additions & 3 deletions b/bin/fetch_oma_groupid.py
Lines changed: 2 additions & 3 deletions
diff --git a/bin/fetch_oma_sequences.py
Lines changed: 4 additions & 3 deletions b/bin/fetch_oma_sequences.py
Lines changed: 4 additions & 3 deletions
diff --git a/bin/fetch_oma_taxid_by_id.py
Lines changed: 2 additions & 0 deletions b/bin/fetch_oma_taxid_by_id.py
Lines changed: 2 additions & 0 deletions
diff --git a/bin/fetch_panther_group.py
Lines changed: 2 additions & 3 deletions b/bin/fetch_panther_group.py
Lines changed: 2 additions & 3 deletions
diff --git a/bin/fetch_refseq_sequences.py
Lines changed: 7 additions & 0 deletions b/bin/fetch_refseq_sequences.py
Lines changed: 7 additions & 0 deletions
@@ -34,31 +34,21 @@
 
 > Huang H, McGarvey PB, Suzek BE, Mazumder R, Zhang J, Chen Y, Wu CH. A comprehensive protein-centric ID mapping service for molecular data integration. Bioinformatics. 2011 Apr 15;27(8):1190-1. doi: 10.1093/bioinformatics/btr101. PMID: 21478197; PMCID: PMC3072559.
 
-- [AlphaFold](https://deepmind.google/technologies/alphafold)
+- [Diamond](https://github.com/bbuchfink/diamond)
 
-> Jumper, J., Evans, R., Pritzel, A. et al. Highly accurate protein structure prediction with AlphaFold. Nature 596, 583–589 (2021). https://doi.org/10.1038/s41586-021-03819-2
+> Buchfink B, Reuter K, Drost HG, "Sensitive protein alignments at tree-of-life scale using DIAMOND", Nature Methods 18, 366–368 (2021). doi:10.1038/s41592-021-01101-x
 
-- [AlphaFold Database](https://alphafold.ebi.ac.uk)
+- [RefSeq](https://www.ncbi.nlm.nih.gov/refseq/)
 
-> Mihaly Varadi, Stephen Anyango, Mandar Deshpande, Sreenath Nair, Cindy Natassia, Galabina Yordanova, David Yuan, Oana Stroe, Gemma Wood, Agata Laydon, Augustin Žídek, Tim Green, Kathryn Tunyasuvunakool, Stig Petersen, John Jumper, Ellen Clancy, Richard Green, Ankur Vora, Mira Lutfi, Michael Figurnov, Andrew Cowie, Nicole Hobbs, Pushmeet Kohli, Gerard Kleywegt, Ewan Birney, Demis Hassabis, Sameer Velankar, AlphaFold Protein Structure Database: massively expanding the structural coverage of protein-sequence space with high-accuracy models, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D439–D444, https://doi.org/10.1093/nar/gkab1061
+> O'Leary NA, Wright MW, Brister JR, Ciufo S, Haddad D, McVeigh R, Rajput B, Robbertse B, Smith-White B, Ako-Adjei D, Astashyn A, Badretdin A, Bao Y, Blinkova O, Brover V, Chetvernin V, Choi J, Cox E, Ermolaeva O, Farrell CM, Goldfarb T, Gupta T, Haft D, Hatcher E, Hlavina W, Joardar VS, Kodali VK, Li W, Maglott D, Masterson P, McGarvey KM, Murphy MR, O'Neill K, Pujar S, Rangwala SH, Rausch D, Riddick LD, Schoch C, Shkeda A, Storz SS, Sun H, Thibaud-Nissen F, Tolstoy I, Tully RE, Vatsan AR, Wallin C, Webb D, Wu W, Landrum MJ, Kimchi A, Tatusova T, DiCuccio M, Kitts P, Murphy TD, Pruitt KD. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res. 2016 Jan 4;44(D1):D733-45. doi: 10.1093/nar/gkv1189. PMID: 26553804.
 
-- [T-COFFEE](https://tcoffee.org)
+- [Ensembl](https://www.ensembl.org)
 
-> Notredame C, Higgins DG, Heringa J. T-Coffee: A novel method for fast and accurate multiple sequence alignment. J Mol Biol. 2000 Sep 8;302(1):205-17. doi: 10.1006/jmbi.2000.4042. PMID: 10964570.
-
-- [IQTREE](https://iqtree.org)
-
-> B.Q. Minh, H.A. Schmidt, O. Chernomor, D. Schrempf, M.D. Woodhams, A. von Haeseler, R. Lanfear (2020) IQ-TREE 2: New models and efficient methods for phylogenetic inference in the genomic era. Mol. Biol. Evol., 37:1530-1534. https://doi.org/10.1093/molbev/msaa015
-
-> D.T. Hoang, O. Chernomor, A. von Haeseler, B.Q. Minh, L.S. Vinh (2018) UFBoot2: Improving the ultrafast bootstrap approximation. Mol. Biol. Evol., 35:518–522. https://doi.org/10.1093/molbev/msx281
-
-- [FastME](https://atgc-montpellier.fr/fastme/)
-
-> Vincent Lefort, Richard Desper, Olivier Gascuel, FastME 2.0: A Comprehensive, Accurate, and Fast Distance-Based Phylogeny Inference Program, Molecular Biology and Evolution, Volume 32, Issue 10, October 2015, Pages 2798–2800, https://doi.org/10.1093/molbev/msv150
+> Sarah C Dyer, Olanrewaju Austine-Orimoloye, Andrey G Azov, Matthieu Barba, If Barnes, Vianey Paola Barrera-Enriquez, Arne Becker, Ruth Bennett, Martin Beracochea, Andrew Berry, Jyothish Bhai, Simarpreet Kaur Bhurji, Sanjay Boddu, Paulo R Branco Lins, Lucy Brooks, Shashank Budhanuru Ramaraju, Lahcen I Campbell, Manuel Carbajo Martinez, Mehrnaz Charkhchi, Lucas A Cortes, Claire Davidson, Sukanya Denni, Kamalkumar Dodiya, Sarah Donaldson, Bilal El Houdaigui, Tamara El Naboulsi, Oluwadamilare Falola, Reham Fatima, Thiago Genez, Jose Gonzalez Martinez, Tatiana Gurbich, Matthew Hardy, Zoe Hollis, Toby Hunt, Mike Kay, Vinay Kaykala, Diana Lemos, Disha Lodha, Nourhen Mathlouthi, Gabriela Alejandra Merino, Ryan Merritt, Louisse Paola Mirabueno, Aleena Mushtaq, Syed Nakib Hossain, José G Pérez-Silva, Malcolm Perry, Ivana Piližota, Daniel Poppleton, Irina Prosovetskaia, Shriya Raj, Ahamed Imran Abdul Salam, Shradha Saraf, Nuno Saraiva-Agostinho, Swati Sinha, Botond Sipos, Vasily Sitnik, Emily Steed, Marie-Marthe Suner, Likhitha Surapaneni, Kyösti Sutinen, Francesca Floriana Tricomi, Ian Tsang, David Urbina-Gómez, Andres Veidenberg, Thomas A Walsh, Natalie L Willhoft, Jamie Allen, Jorge Alvarez-Jarreta, Marc Chakiachvili, Jitender Cheema, Jorge Batista da Rocha, Nishadi H De Silva, Stefano Giorgetti, Leanne Haggerty, Garth R Ilsley, Jon Keatley, Jane E Loveland, Benjamin Moore, Jonathan M Mudge, Guy Naamati, John Tate, Stephen J Trevanion, Andrea Winterbottom, Bethany Flint, Adam Frankish, Sarah E Hunt, Robert D Finn, Mallory A Freeberg, Peter W Harrison, Fergal J Martin, and Andrew D Yates. Ensembl 2025. Nucleic Acids Res. 2025, 53(D1):D948–D957. PMID: 39656687
 
 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
 
-  > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
+> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
 
 ## Software packaging/containerisation tools
 
 
@@ -3,13 +3,14 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Convert a list of IDs into a CSV file with a header.
+
+The header row is required for the later CSV table merge to work."""
+
 import sys
 
 
 def csv_adorn(path: str, header: str) -> None:
-    """
-    Convert a list of IDs into a CSV file with a header. Used for later table merge.
-    """
     print(f"id,{header}")
     with open(path) as f:
         any_data = False
 
@@ -3,15 +3,15 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Convert Ensembl IDs to UniProt IDs using the UniProt mapping API."""
+
 import sys
 
 from utils import check_id_mapping_results_ready, safe_get, safe_post
 
 
 def ensembl2uniprot(ensembl_ids: list[str]) -> list[str]:
-    """
-    Convert a list of Ensembl IDs to UniProt IDs using the UniProt mapping API.
-    """
+    """Convert a list of Ensembl IDs to UniProt IDs using the UniProt mapping API."""
     if len(ensembl_ids) == 0:
         return []
 
 
@@ -3,6 +3,8 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch Ensembl species identifiers and their NCBI taxon IDs from the Ensembl API."""
+
 import requests
 
 
 
@@ -3,12 +3,15 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch protein sequences from Ensembl using the Ensembl REST API."""
+
 import csv
 import sys
 
 from utils import list_to_file, safe_post, SequenceInfo, split_ids
 
 def fetch_slice(ids: list[str], idmap: dict[str,str]) -> list[SequenceInfo]:
+    """Fetch taxon IDs and sequences for given protein IDs from Ensembl."""
     hits = {}
     # fetch taxon information
     payload = {"ids": ids}
@@ -43,6 +46,7 @@ def fetch_slice(ids: list[str], idmap: dict[str,str]) -> list[SequenceInfo]:
 
 
 def fetch_ensembl(ids: list[str], idmap_path: str) -> list[SequenceInfo]:
+    """Fetch taxon IDs and sequences for given protein IDs from Ensembl in slices of 100."""
     taxon_map = {}
     with open(idmap_path) as f:
         for it in csv.reader(f):
 
@@ -3,15 +3,15 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch orthologs for a given UniProt ID from the OrthoInspector database."""
+
 import sys
 
 from utils import safe_get
 
 
 def fetch_inspector_by_id(uniprot_id: str, db_id: str = "Eukaryota2019") -> None:
-    """
-    Fetch orthologs for a given UniProt ID from the OrthoInspector database.
-    """
+    """Fetch orthologs for a given UniProt ID from the OrthoInspector database."""
     url = f"https://lbgi.fr/api/orthoinspector/{db_id}/protein/{uniprot_id}/orthologs"
     res = safe_get(url)
 
 
@@ -3,6 +3,8 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch OMA entry for a given protein sequence from the OMA browser API."""
+
 import sys
 from warnings import warn
 
 
@@ -3,15 +3,14 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch members of an OMA group by ID."""
+
 import sys
 from warnings import warn
 from utils import safe_get
 
 
 def main() -> None:
-    """
-    Fetch members of an OMA group by ID.
-    """
     if len(sys.argv) < 2:
         raise ValueError("Too few arguments. Usage: fetch_oma_group_by_id.py <id>")
 
 
@@ -3,16 +3,15 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Get OMA group ID from a UniProt ID."""
+
 import sys
 from warnings import warn
 
 from utils import safe_get
 
 
 def main() -> None:
-    """
-    Get OMA group ID from a UniProt ID.
-    """
     if len(sys.argv) < 2:
         raise ValueError("Not enough arguments. Usage: fetch_oma_groupid.py <filename>")
 
 
@@ -3,15 +3,15 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch protein sequences from the OMA database using the OMA REST API."""
+
 import sys
 
 from utils import list_to_file, safe_post, SequenceInfo, split_ids
 
 
 def fetch_slice(ids: list[str]) -> list[SequenceInfo]:
-    """
-    Fetch sequences for given UniProt IDs from the OMA database.
-    """
+    """Fetch sequences for given UniProt IDs from the OMA database."""
     payload = {"ids": ids}
 
     res = safe_post("https://omabrowser.org/api/protein/bulk_retrieve/", json=payload)
@@ -31,6 +31,7 @@ def fetch_slice(ids: list[str]) -> list[SequenceInfo]:
 
 
 def fetch_seqs_oma(ids: list[str]) -> list[SequenceInfo]:
+    """Fetch sequences for given UniProt IDs from the OMA database in slices of 100."""
     seqs = []
     for s in split_ids(ids, 100):
         seqs = seqs + fetch_slice(s)
 
@@ -3,6 +3,8 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch OMA taxon ID by UniProt ID."""
+
 import sys
 from warnings import warn
 
 
@@ -3,16 +3,15 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch members of a Panther group by ID."""
+
 import sys
 from warnings import warn
 
 from utils import safe_get
 
 
 def main() -> None:
-    """
-    Fetch members of a Panther group by ID.
-    """
     if len(sys.argv) < 3:
         raise ValueError("Too few arguments. Usage: fetch_panther_group.py <id> <organism>")
 
 
@@ -3,6 +3,8 @@
 # Written by Igor Trujnara, released under the MIT license
 # See https://opensource.org/license/mit for details
 
+"""Fetch protein sequences from the RefSeq database using the NCBI eutils API."""
+
 import sys
 from xml.dom import minidom
 
@@ -11,21 +13,25 @@
 
 
 def get_taxid(node: minidom.Element) -> str:
+    """Extract the taxid from the XML object."""
     taxid = node.getElementsByTagName("TSeq_taxid")[0].firstChild.wholeText
     return taxid
 
 
 def get_sequence(node: minidom.Element) -> str:
+    """Extract the sequence from the XML object."""
     seq = node.getElementsByTagName("TSeq_sequence")[0].firstChild.wholeText
     return seq
 
 
 def get_prot_id(node: minidom.Element) -> str:
+    """Extract the protein ID from the XML object."""
     prot_id = node.getElementsByTagName("TSeq_accver")[0].firstChild.wholeText.split(".")[0]
     return prot_id
 
 
 def fetch_slice(ids: list[str], db: str = "protein") -> list[SequenceInfo]:
+    """Fetch sequences for given protein IDs from the RefSeq database."""
     id_string = ",".join(ids)
     fasta = Entrez.efetch(db=db, id=id_string, rettype="fasta", retmode="xml")
     seqs = minidom.parse(fasta).getElementsByTagName("TSeq")
@@ -35,6 +41,7 @@ def fetch_slice(ids: list[str], db: str = "protein") -> list[SequenceInfo]:
 
 
 def fetch_sequences(ids: list[str], db: str = "protein") -> list[SequenceInfo]:
+    """Fetch sequences for given protein IDs from the RefSeq database in slices of 100."""
     seqs = []
     for s in split_ids(ids, 100):
         seqs += fetch_slice(s, db)