Skip to content

Commit cf60c85

Browse files
authored
Fix Clinical Data NA Count Bug with Cross-Study Attribute Level Conflict Resolution (#11603)
* Categorize attributes, map patients to samples on conflicts * Add tests
1 parent 6b9c393 commit cf60c85

File tree

11 files changed

+501
-59
lines changed

11 files changed

+501
-59
lines changed

src/main/java/org/cbioportal/domain/clinical_data/repository/ClinicalDataRepository.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,13 @@ List<ClinicalData> getSampleClinicalData(
3535
* filtered attributes.
3636
*
3737
* @param studyViewFilterContext The filter criteria for the study view.
38-
* @param filteredAttributes A list of attributes to filter the clinical data.
38+
* @param sampleAttributeIds A list of sample prioritized attributes to filter the clinical data.
39+
* @param patientAttributeIds A list of patient attributes to filter the clinical data.
3940
* @return A list of {@link ClinicalDataCountItem} representing clinical data counts.
4041
*/
4142
List<ClinicalDataCountItem> getClinicalDataCounts(
42-
StudyViewFilterContext studyViewFilterContext, List<String> filteredAttributes);
43+
StudyViewFilterContext studyViewFilterContext,
44+
List<String> sampleAttributeIds,
45+
List<String> patientAttributeIds,
46+
List<String> conflictingAttributeIds);
4347
}

src/main/java/org/cbioportal/domain/clinical_data/usecase/GetClinicalDataCountsUseCase.java

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
import java.util.List;
44
import org.cbioportal.domain.clinical_attributes.usecase.GetClinicalAttributesForStudiesUseCase;
55
import org.cbioportal.domain.clinical_data.repository.ClinicalDataRepository;
6+
import org.cbioportal.domain.clinical_data.util.ClinicalAttributeUtil;
7+
import org.cbioportal.domain.clinical_data.util.ClinicalAttributeUtil.CategorizedClinicalAttributeIds;
68
import org.cbioportal.domain.patient.usecase.GetFilteredPatientCountUseCase;
79
import org.cbioportal.domain.sample.usecase.GetFilteredSamplesCountUseCase;
810
import org.cbioportal.domain.studyview.StudyViewFilterContext;
11+
import org.cbioportal.legacy.model.ClinicalAttribute;
912
import org.cbioportal.legacy.model.ClinicalDataCountItem;
1013
import org.cbioportal.legacy.service.util.StudyViewColumnarServiceUtil;
1114
import org.springframework.context.annotation.Profile;
@@ -46,21 +49,42 @@ public GetClinicalDataCountsUseCase(
4649
}
4750

4851
/**
49-
* Executes the use case to retrieve and process clinical data counts. It normalizes the data
52+
* Executes the use case to retrieve and process clinical data counts with unified strategy. For
53+
* attributes that exist at both sample and patient levels across studies, this method applies a
54+
* sample-priority strategy to ensure consistent semantic interpretation. It normalizes the data
5055
* counts and ensures that missing attributes are restored.
5156
*
5257
* @param studyViewFilterContext the context of the study view filter to apply
5358
* @param filteredAttributes a list of filtered clinical attribute IDs
5459
* @return a list of {@link ClinicalDataCountItem} containing the normalized and complete clinical
55-
* data counts
60+
* data counts with unified sample/patient level strategy
5661
*/
5762
public List<ClinicalDataCountItem> execute(
5863
StudyViewFilterContext studyViewFilterContext, List<String> filteredAttributes) {
5964

6065
List<String> involvedCancerStudies = studyViewFilterContext.customDataFilterCancerStudies();
6166

67+
// Get all clinical attributes for the involved studies to determine attribute levels
68+
List<ClinicalAttribute> clinicalAttributes =
69+
getClinicalAttributesForStudiesUseCase.execute(involvedCancerStudies);
70+
71+
List<ClinicalAttribute> filteredClinicalAttributes =
72+
clinicalAttributes.stream()
73+
.filter(attr -> filteredAttributes.contains(attr.getAttrId()))
74+
.toList();
75+
76+
CategorizedClinicalAttributeIds categorizedAttributeIds =
77+
ClinicalAttributeUtil.categorizeClinicalAttributes(filteredClinicalAttributes);
78+
List<String> sampleAttributeIds = categorizedAttributeIds.sampleAttributeIds();
79+
List<String> patientAttributeIds = categorizedAttributeIds.patientAttributeIds();
80+
List<String> conflictingAttributeIds = categorizedAttributeIds.conflictingAttributeIds();
81+
6282
var result =
63-
clinicalDataRepository.getClinicalDataCounts(studyViewFilterContext, filteredAttributes);
83+
clinicalDataRepository.getClinicalDataCounts(
84+
studyViewFilterContext,
85+
sampleAttributeIds,
86+
patientAttributeIds,
87+
conflictingAttributeIds);
6488

6589
// Normalize data counts so that values like TRUE, True, and true are all merged in one count
6690
result.forEach(
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
package org.cbioportal.domain.clinical_data.util;
2+
3+
import java.util.HashSet;
4+
import java.util.List;
5+
import java.util.Map;
6+
import java.util.Set;
7+
import java.util.stream.Collectors;
8+
import org.cbioportal.legacy.model.ClinicalAttribute;
9+
10+
public abstract class ClinicalAttributeUtil {
11+
private ClinicalAttributeUtil() {}
12+
13+
/**
14+
* Categorizes clinical attributes into sample, patient, and conflicting categories based on their
15+
* attribute IDs and types across different studies.
16+
*
17+
* @param clinicalAttributes List of clinical attributes to categorize
18+
* @return CategorizedClinicalAttributes containing the categorized attribute IDs
19+
*/
20+
public static CategorizedClinicalAttributeIds categorizeClinicalAttributes(
21+
List<ClinicalAttribute> clinicalAttributes) {
22+
23+
// Group attributes by attribute ID
24+
Map<String, List<ClinicalAttribute>> clinicalAttributesById =
25+
clinicalAttributes.stream().collect(Collectors.groupingBy(ClinicalAttribute::getAttrId));
26+
27+
Set<String> sampleAttributeIds = new HashSet<>();
28+
Set<String> patientAttributeIds = new HashSet<>();
29+
Set<String> conflictingAttributeIds = new HashSet<>();
30+
31+
// For each attribute ID, check if it's sample, patient, or conflicting
32+
for (Map.Entry<String, List<ClinicalAttribute>> entry : clinicalAttributesById.entrySet()) {
33+
String attrId = entry.getKey();
34+
List<ClinicalAttribute> attributes = entry.getValue();
35+
36+
boolean isSampleAttribute = false;
37+
boolean isPatientAttribute = false;
38+
39+
for (ClinicalAttribute attribute : attributes) {
40+
if (Boolean.TRUE.equals(attribute.getPatientAttribute())) {
41+
isPatientAttribute = true;
42+
} else {
43+
isSampleAttribute = true;
44+
}
45+
}
46+
47+
// Categorize based on the flags
48+
if (isSampleAttribute && isPatientAttribute) {
49+
conflictingAttributeIds.add(attrId);
50+
} else if (isPatientAttribute) {
51+
patientAttributeIds.add(attrId);
52+
} else {
53+
sampleAttributeIds.add(attrId);
54+
}
55+
}
56+
57+
return new CategorizedClinicalAttributeIds(
58+
sampleAttributeIds.stream().toList(),
59+
patientAttributeIds.stream().toList(),
60+
conflictingAttributeIds.stream().toList());
61+
}
62+
63+
/** Record containing categorized clinical attribute IDs */
64+
public record CategorizedClinicalAttributeIds(
65+
List<String> sampleAttributeIds,
66+
List<String> patientAttributeIds,
67+
List<String> conflictingAttributeIds) {}
68+
}

src/main/java/org/cbioportal/infrastructure/repository/clickhouse/clinical_data/ClickhouseClinicalDataMapper.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,16 @@ public interface ClickhouseClinicalDataMapper {
1616
* filtered attribute values.
1717
*
1818
* @param studyViewFilterContext the context of the study view filter
19-
* @param attributeIds the list of attribute IDs to filter by
20-
* @param filteredAttributeValues the list of filtered attribute values
19+
* @param sampleAttributeIds the list of sample attribute IDs to filter by
20+
* @param patientAttributeIds the list of patient attribute IDs to filter by
21+
* @param conflictingAttributeIds the list of both sample and patient attribute IDs to filter by
2122
* @return a list of clinical data count items
2223
*/
2324
List<ClinicalDataCountItem> getClinicalDataCounts(
2425
StudyViewFilterContext studyViewFilterContext,
25-
List<String> attributeIds,
26-
List<String> filteredAttributeValues);
26+
List<String> sampleAttributeIds,
27+
List<String> patientAttributeIds,
28+
List<String> conflictingAttributeIds);
2729

2830
/**
2931
* Retrieves sample clinical data based on the study view filter context and attribute IDs.

src/main/java/org/cbioportal/infrastructure/repository/clickhouse/clinical_data/ClickhouseClinicalDataRepository.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package org.cbioportal.infrastructure.repository.clickhouse.clinical_data;
22

3-
import java.util.Collections;
43
import java.util.List;
54
import org.cbioportal.domain.clinical_data.repository.ClinicalDataRepository;
65
import org.cbioportal.domain.studyview.StudyViewFilterContext;
@@ -13,8 +12,6 @@
1312
@Profile("clickhouse")
1413
public class ClickhouseClinicalDataRepository implements ClinicalDataRepository {
1514

16-
private static final List<String> FILTERED_CLINICAL_ATTR_VALUES = Collections.emptyList();
17-
1815
private final ClickhouseClinicalDataMapper mapper;
1916

2017
public ClickhouseClinicalDataRepository(ClickhouseClinicalDataMapper mapper) {
@@ -37,8 +34,11 @@ public List<ClinicalData> getSampleClinicalData(
3734

3835
@Override
3936
public List<ClinicalDataCountItem> getClinicalDataCounts(
40-
StudyViewFilterContext studyViewFilterContext, List<String> filteredAttributes) {
37+
StudyViewFilterContext studyViewFilterContext,
38+
List<String> sampleAttributeIds,
39+
List<String> patientAttributeIds,
40+
List<String> conflictingAttributeIds) {
4141
return mapper.getClinicalDataCounts(
42-
studyViewFilterContext, filteredAttributes, FILTERED_CLINICAL_ATTR_VALUES);
42+
studyViewFilterContext, sampleAttributeIds, patientAttributeIds, conflictingAttributeIds);
4343
}
4444
}

src/main/resources/mappers/clickhouse/clinical_data/ClickhouseClinicalDataMapper.xml

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
<?xml version="1.0" encoding="UTF-8"?>
22
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
33

4-
<mapper
5-
namespace="org.cbioportal.infrastructure.repository.clickhouse.clinical_data.ClickhouseClinicalDataMapper">
4+
<mapper namespace="org.cbioportal.infrastructure.repository.clickhouse.clinical_data.ClickhouseClinicalDataMapper">
65

76
<select id="getSampleClinicalDataFromStudyViewFilter" resultType="org.cbioportal.legacy.model.ClinicalData">
87
SELECT
@@ -47,13 +46,33 @@
4746

4847
<!-- for /clinical-data-counts/fetch (returns ClinicalData) which will then be converted to clinicalDataCountItems -->
4948
<select id="getClinicalDataCounts" resultMap="ClinicalDataCountItemResultMap">
50-
<include refid="getClinicalDataCountsQuery">
51-
<property name="type" value="sample"/>
52-
</include>
53-
UNION ALL
54-
<include refid="getClinicalDataCountsQuery">
55-
<property name="type" value="patient"/>
56-
</include>
49+
<if test="sampleAttributeIds != null and !sampleAttributeIds.isEmpty()">
50+
<include refid="getClinicalDataCountsQuery">
51+
<property name="type" value="sample"/>
52+
<property name="attributeIds" value="sampleAttributeIds"/>
53+
<property name="isConflicting" value="false"/>
54+
</include>
55+
</if>
56+
57+
<if test="patientAttributeIds != null and !patientAttributeIds.isEmpty()">
58+
<if test="sampleAttributeIds != null and !sampleAttributeIds.isEmpty()">UNION ALL</if>
59+
<include refid="getClinicalDataCountsQuery">
60+
<property name="type" value="patient"/>
61+
<property name="attributeIds" value="patientAttributeIds"/>
62+
<property name="isConflicting" value="false"/>
63+
</include>
64+
</if>
65+
66+
<if test="conflictingAttributeIds != null and !conflictingAttributeIds.isEmpty()">
67+
<if test="(sampleAttributeIds != null and !sampleAttributeIds.isEmpty()) or (patientAttributeIds != null and !patientAttributeIds.isEmpty())">
68+
UNION ALL
69+
</if>
70+
<include refid="getClinicalDataCountsQuery">
71+
<property name="type" value="patient"/>
72+
<property name="attributeIds" value="conflictingAttributeIds"/>
73+
<property name="isConflicting" value="true"/>
74+
</include>
75+
</if>
5776
</select>
5877

5978
<sql id="getClinicalDataCountsQuery">
@@ -63,9 +82,12 @@
6382
attribute_name AS attributeId,
6483
attribute_value AS value,
6584
cast(count(*) AS INTEGER) as count
66-
FROM clinical_data_derived
85+
FROM clinical_data_derived cdd
86+
<if test="'${isConflicting}' == 'true'">
87+
<!-- JOIN patient data with sample table to map patient-level attributes to sample-level counts -->
88+
LEFT JOIN sample_derived sd ON cdd.patient_unique_id = sd.patient_unique_id
89+
</if>
6790
<where>
68-
type='${type}'
6991
AND <!-- Table creation in clickhouse.sql has ensured no NA values but extra caution is always appreciated -->
7092
<include refid="org.cbioportal.infrastructure.repository.clickhouse.studyview.ClickhouseStudyViewFilterMapper.normalizeAttributeValue">
7193
<property name="attribute_value" value="value"/>
@@ -81,7 +103,7 @@
81103
</otherwise>
82104
</choose>
83105
AND attribute_name IN
84-
<foreach item="attributeId" collection="attributeIds" open="(" separator="," close=")">
106+
<foreach item="attributeId" collection="${attributeIds}" open="(" separator="," close=")">
85107
#{attributeId}
86108
</foreach>
87109
</where>
@@ -94,13 +116,14 @@
94116
'NA' AS value,
95117
((
96118
<choose>
97-
<when test="'${type}' == 'sample'">
98-
<include
99-
refid="org.cbioportal.infrastructure.repository.clickhouse.sample.ClickhouseSampleMapper.getFilteredSampleCount"/>
119+
<when test="'${type}' == 'sample' or '${isConflicting}' == 'true'">
120+
<!-- Use sample count for:
121+
1. Pure sample attributes (type='sample')
122+
2. Conflicting attributes (already mapped to sample-level via JOIN) -->
123+
<include refid="org.cbioportal.infrastructure.repository.clickhouse.sample.ClickhouseSampleMapper.getFilteredSampleCount"/>
100124
</when>
101125
<otherwise>
102-
<include
103-
refid="org.cbioportal.infrastructure.repository.clickhouse.patient.ClickhousePatientMapper.getFilteredPatientCount"/>
126+
<include refid="org.cbioportal.infrastructure.repository.clickhouse.patient.ClickhousePatientMapper.getFilteredPatientCount"/>
104127
</otherwise>
105128
</choose>
106129
) - clinical_data_sum.sum) AS count
@@ -111,7 +134,6 @@
111134
)
112135
</sql>
113136

114-
115137
<resultMap id="ClinicalDataCountItemResultMap" type="org.cbioportal.legacy.model.ClinicalDataCountItem">
116138
<result property="attributeId" column="attributeId"/>
117139
<collection property="counts" ofType="org.cbioportal.legacy.model.ClinicalDataCount">

0 commit comments

Comments
 (0)