Skip to content

Commit c5075ea

Browse files
authored
feat(wmg): census-based wmg builder - snapshot v3 (#6020)
1 parent e4f18d5 commit c5075ea

File tree

117 files changed

+1872
-4719
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

117 files changed

+1872
-4719
lines changed

Dockerfile.wmg_pipeline

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ ADD backend/wmg/__init__.py backend/wmg/__init__.py
1919
ADD backend/wmg/config.py backend/wmg/config.py
2020
ADD backend/wmg/data backend/wmg/data
2121
ADD backend/wmg/pipeline backend/wmg/pipeline
22+
ADD backend/wmg/api backend/wmg/api
23+
ADD backend/cellguide/pipeline backend/cellguide/pipeline
2224
ADD backend/layers backend/layers
2325
ADD backend/common backend/common
2426

@@ -29,4 +31,4 @@ LABEL commit=${HAPPY_COMMIT}
2931
ENV COMMIT_SHA=${HAPPY_COMMIT}
3032
ENV COMMIT_BRANCH=${HAPPY_BRANCH}
3133

32-
CMD ["python3", "-m", "backend.wmg.pipeline.cube_pipeline"]
34+
CMD ["python3", "-m", "backend.wmg.pipeline"]

backend/cellguide/pipeline/computational_marker_genes/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import logging
22

3-
from backend.cellguide.pipeline.computational_marker_genes.computational_markers import MarkerGenesCalculator
4-
from backend.cellguide.pipeline.computational_marker_genes.constants import MARKER_SCORE_THRESHOLD
3+
from backend.cellguide.pipeline.computational_marker_genes.computational_markers import (
4+
MARKER_SCORE_THRESHOLD,
5+
MarkerGenesCalculator,
6+
)
57
from backend.cellguide.pipeline.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
68
from backend.cellguide.pipeline.ontology_tree import get_ontology_tree_builder
79
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder

backend/cellguide/pipeline/computational_marker_genes/computational_markers.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,6 @@
1010
from dask.diagnostics import ProgressBar
1111
from tqdm import tqdm
1212

13-
from backend.cellguide.pipeline.computational_marker_genes.constants import (
14-
MARKER_SCORE_THRESHOLD,
15-
)
1613
from backend.cellguide.pipeline.computational_marker_genes.types import ComputationalMarkerGenes
1714
from backend.cellguide.pipeline.computational_marker_genes.utils import (
1815
bootstrap_rows_percentiles,
@@ -43,6 +40,8 @@
4340
or any arbitrary combinations of metadata dimensions.
4441
"""
4542

43+
MARKER_SCORE_THRESHOLD = 0.5
44+
4645

4746
class MarkerGenesCalculator:
4847
def __init__(self, *, snapshot: WmgSnapshot, all_cell_type_ids_in_corpus: list[str], groupby_terms: list[str]):

backend/cellguide/pipeline/computational_marker_genes/constants.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

backend/cellguide/pipeline/source_collections/source_collections_generator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
from backend.cellguide.pipeline.canonical_marker_genes.utils import format_citation_dp
22
from backend.cellguide.pipeline.source_collections.types import SourceCollectionsData
33
from backend.common.utils.rollup import descendants
4-
from backend.wmg.data.utils import get_collections_from_curation_api, get_datasets_from_curation_api
4+
from backend.wmg.data.utils import get_collections_from_discover_api, get_datasets_from_discover_api
55

66

77
def generate_source_collections_data(all_cell_type_ids_in_corpus: list[str]) -> dict[str, list[SourceCollectionsData]]:
88
"""
99
For each cell type id in the corpus, we want to generate a SourceCollectionsData object, which contains
1010
metadata about the source data for each cell type
1111
"""
12-
all_datasets = get_datasets_from_curation_api()
13-
all_collections = get_collections_from_curation_api()
12+
all_datasets = get_datasets_from_discover_api()
13+
all_collections = get_collections_from_discover_api()
1414

1515
collections_dict = {collection["collection_id"]: collection for collection in all_collections}
1616
datasets_dict = {dataset["dataset_id"]: dataset for dataset in all_datasets}

backend/common/utils/exceptions.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,5 @@ class NonExistentDatasetException(CorporaException):
2323
pass
2424

2525

26-
class CubeValidationException(Exception):
27-
pass
28-
29-
3026
class MarkerGeneCalculationException(Exception):
3127
pass

backend/common/utils/result_notification.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def gen_wmg_pipeline_failure_message(failure_info: str) -> dict:
8585
}
8686

8787

88-
def gen_wmg_pipeline_success_message(snapshot_path: str, dataset_count: int, cell_count: int, gene_count: int) -> dict:
88+
def gen_wmg_pipeline_success_message(snapshot_path: str, dataset_count: int, cell_count: int) -> dict:
8989
return {
9090
"blocks": [
9191
{
@@ -101,8 +101,7 @@ def gen_wmg_pipeline_success_message(snapshot_path: str, dataset_count: int, cel
101101
"text": {
102102
"type": "mrkdwn",
103103
"text": f"\n* WMG snapshot stored in {snapshot_path}"
104-
f"\n* The cube contains {cell_count} cells from {dataset_count} "
105-
f"\n datasets, with expression scores across {gene_count} genes.",
104+
f"\n* The cube contains {cell_count} cells from {dataset_count} datasets.",
106105
},
107106
},
108107
]

backend/scripts/wmg_query_examples.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from backend.wmg.data import query
1212
from backend.wmg.data.snapshot import (
1313
EXPRESSION_SUMMARY_CUBE_NAME,
14-
EXPRESSION_SUMMARY_FMG_CUBE_NAME,
1514
WmgSnapshot,
1615
_open_cube,
1716
)
@@ -27,20 +26,9 @@ def load_snapshot(snapshot_id) -> WmgSnapshot:
2726
cube = _open_cube(
2827
f's3://cellxgene-wmg-{os.environ["DEPLOYMENT_STAGE"]}/{snapshot_id}/{EXPRESSION_SUMMARY_CUBE_NAME}/'
2928
)
30-
cube_fmg = _open_cube(
31-
f's3://cellxgene-wmg-{os.environ["DEPLOYMENT_STAGE"]}/{snapshot_id}/{EXPRESSION_SUMMARY_FMG_CUBE_NAME}/'
32-
)
3329
return WmgSnapshot(
3430
snapshot_identifier=snapshot_id,
3531
expression_summary_cube=cube,
36-
cell_counts_cube=None,
37-
cell_type_orderings=pd.DataFrame(),
38-
primary_filter_dimensions=pd.DataFrame(),
39-
expression_summary_fmg_cube=cube_fmg,
40-
dataset_to_gene_ids={},
41-
marker_genes_cube=None,
42-
filter_relationships=None,
43-
dataset_metadata=None,
4432
)
4533

4634

backend/wmg/data/constants.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,3 @@
1-
# Minimum number of expressed genes for a cell to be included in the corpus.
2-
# See the following document for further details:
3-
# https://github.com/chanzuckerberg/cellxgene-documentation/blob/main/scExpression/scExpression-documentation.md#removal-of-low-coverage-cells
4-
GENE_EXPRESSION_COUNT_MIN_THRESHOLD = 500
5-
6-
# Minimum value for raw expression counts that will be used to filter out computed RankIt values. Details:
7-
# https://github.com/chanzuckerberg/cellxgene-documentation/blob/main/scExpression/scExpression-documentation.md#removal-of-noisy-ultra-low-expression-values
8-
RANKIT_RAW_EXPR_COUNT_FILTERING_MIN_THRESHOLD = 2
9-
101
# wmg only includes data generated by assays that normalize for gene length
112
INCLUDED_ASSAYS = {
123
"EFO:0010550": "sci-RNA-seq",

backend/wmg/data/query.py

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,6 @@ class WmgFiltersQueryCriteria(BaseModel):
5151
publication_citations: List[str] = Field(default=[], unique_items=True, min_items=0)
5252

5353

54-
class FmgQueryCriteria(BaseModel):
55-
organism_ontology_term_id: str # required!
56-
tissue_ontology_term_ids: List[str] = Field(default=[], unique_items=True, min_items=0)
57-
cell_type_ontology_term_ids: List[str] = Field(default=[], unique_items=True, min_items=0, max_items=1)
58-
59-
6054
class MarkerGeneQueryCriteria(BaseModel):
6155
organism_ontology_term_id: str # required!
6256
tissue_ontology_term_id: str # required!
@@ -102,19 +96,13 @@ def expression_summary_default(self, criteria: WmgQueryCriteria) -> DataFrame:
10296
criteria=criteria,
10397
)
10498

105-
def expression_summary_fmg(self, criteria: FmgQueryCriteria) -> DataFrame:
106-
return self._query(
107-
cube=self._snapshot.expression_summary_fmg_cube,
108-
criteria=criteria,
109-
)
110-
11199
def marker_genes(self, criteria: MarkerGeneQueryCriteria) -> DataFrame:
112100
return self._query(
113101
cube=self._snapshot.marker_genes_cube,
114102
criteria=criteria,
115103
)
116104

117-
def cell_counts(self, criteria: Union[WmgQueryCriteria, FmgQueryCriteria], compare_dimension=None) -> DataFrame:
105+
def cell_counts(self, criteria: WmgQueryCriteria, compare_dimension=None) -> DataFrame:
118106
cell_counts = self._query(
119107
cube=self._snapshot.cell_counts_cube,
120108
criteria=criteria.copy(exclude={"gene_ontology_term_ids"}),
@@ -128,7 +116,7 @@ def cell_counts(self, criteria: Union[WmgQueryCriteria, FmgQueryCriteria], compa
128116
def _query(
129117
self,
130118
cube: Array,
131-
criteria: Union[WmgQueryCriteria, WmgQueryCriteriaV2, FmgQueryCriteria, MarkerGeneQueryCriteria],
119+
criteria: Union[WmgQueryCriteria, WmgQueryCriteriaV2, MarkerGeneQueryCriteria],
132120
compare_dimension=None,
133121
) -> DataFrame:
134122
indexed_dims = self._cube_query_params.get_indexed_dims_to_lookup_query_criteria(
@@ -165,12 +153,6 @@ def _query(
165153
attrs = self._cube_query_params.get_attrs_for_cube_query(cube)
166154
if compare_dimension is not None:
167155
attrs.append(compare_dimension)
168-
if (
169-
isinstance(criteria, FmgQueryCriteria)
170-
and compare_dimension != "dataset_id"
171-
and "dataset_id" in [i.name for i in cube.schema]
172-
):
173-
attrs.append("dataset_id")
174156

175157
attrs += numeric_attrs
176158

backend/wmg/data/rankit.py

Lines changed: 0 additions & 49 deletions
This file was deleted.

0 commit comments

Comments
 (0)