diff --git a/donor_completeness.patch b/donor_completeness.patch new file mode 100644 index 00000000..f77c1c83 --- /dev/null +++ b/donor_completeness.patch @@ -0,0 +1,1052 @@ +diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py +index 2aeb879..e51d760 100644 +--- a/src/clinical_etl/CSVConvert.py ++++ b/src/clinical_etl/CSVConvert.py +@@ -635,6 +635,101 @@ def load_manifest(manifest_file): + return result + + ++def summarize_completeness(donor_completeness): ++ """Aggregate per-donor completeness into ID-free counts. ++ ++ Produces two independent partitions of all donors: ++ * minimal: tier_a_min_clinical_complete + tier_b_min_clinical_complete ++ + incomplete_min_donors ++ * fulsome: tier_a_full_clinical_complete + tier_b_full_clinical_complete ++ + incomplete_full_donors ++ A donor is counted in a tier bucket only if it meets that tier AND the ++ relevant completeness level; everything else (wrong/absent tier, or not ++ complete) falls into the matching incomplete bucket. 
Tier assignment is ++ exclusive, so a Tier A donor is never counted toward a Tier B bucket.""" ++ summary = { ++ "total_donors": len(donor_completeness), ++ "tier_a_min_clinical_complete": 0, ++ "tier_b_min_clinical_complete": 0, ++ "incomplete_min_donors": 0, ++ "tier_a_full_clinical_complete": 0, ++ "tier_b_full_clinical_complete": 0, ++ "incomplete_full_donors": 0, ++ } ++ for rec in donor_completeness.values(): ++ tier = rec["tier"] ++ # minimal partition ++ if tier == "A" and rec["minimal_complete"]: ++ summary["tier_a_min_clinical_complete"] += 1 ++ elif tier == "B" and rec["minimal_complete"]: ++ summary["tier_b_min_clinical_complete"] += 1 ++ else: ++ summary["incomplete_min_donors"] += 1 ++ # fulsome partition ++ if tier == "A" and rec["fulsome_complete"]: ++ summary["tier_a_full_clinical_complete"] += 1 ++ elif tier == "B" and rec["fulsome_complete"]: ++ summary["tier_b_full_clinical_complete"] += 1 ++ else: ++ summary["incomplete_full_donors"] += 1 ++ return summary ++ ++ ++def build_completeness_failures(donor_completeness, tier_criteria=None): ++ """Build a detailed per-donor report of every donor that is not fully ++ (tier + fulsome) complete, with the reasons it failed. ++ ++ A donor is considered failing unless it is assigned a tier (A or B) AND is ++ fulsome complete. 
For each failing donor the report lists the offending ++ sample composition and/or the specific unmet minimal and fulsome fields.""" ++ def _tier_requirement_text(): ++ if not tier_criteria: ++ return "any tier" ++ parts = [] ++ for tier, crit in tier_criteria.items(): ++ desc = ", ".join(f"{n} {kind}" for kind, n in crit.items()) ++ parts.append(f"Tier {tier} ({desc})") ++ return " or ".join(parts) ++ ++ failing = [] ++ for donor_id, rec in donor_completeness.items(): ++ tiered = rec["tier"] in ("A", "B") ++ if tiered and rec["fulsome_complete"]: ++ continue # fully complete -> not a failure ++ ++ reasons = [] ++ if not tiered: ++ reasons.append( ++ f"Sample composition does not satisfy {_tier_requirement_text()}; " ++ f"found {rec['sample_counts'] or 'no classifiable tumour/normal DNA/RNA samples'}" ++ ) ++ if not rec["minimal_complete"]: ++ reasons.append( ++ f"Fails minimal clinical completeness: {len(rec['minimal_unmet'])} field(s) missing" ++ ) ++ if not rec["fulsome_complete"]: ++ reasons.append( ++ f"Fails fulsome clinical completeness: {len(rec['fulsome_unmet'])} " ++ f"required/conditionally-required field(s) missing" ++ ) ++ failing.append({ ++ "donor_id": donor_id, ++ "tier": rec["tier"], ++ "minimal_complete": rec["minimal_complete"], ++ "fulsome_complete": rec["fulsome_complete"], ++ "reasons": reasons, ++ "sample_counts": rec["sample_counts"], ++ "minimal_unmet": rec["minimal_unmet"], ++ "fulsome_unmet": rec["fulsome_unmet"], ++ }) ++ ++ return { ++ "total_donors": len(donor_completeness), ++ "failing_donors": len(failing), ++ "donors": failing, ++ } ++ ++ + def csv_convert(input_path, manifest_file, minify=False, index_output=False, verbose=False): + mappings.VERBOSE = verbose + # read manifest data +@@ -756,9 +851,24 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver + schema.validate_ingest_map(result) + validation_results = {"validation_errors": schema.validation_errors, + "validation_warnings": 
schema.validation_warnings, +- "cases_missing_data": schema.statistics["cases_missing_data"]} ++ "cases_missing_data": schema.statistics["cases_missing_data"], ++ "donor_completeness": schema.statistics.get("donor_completeness", {})} + result["statistics"] = schema.statistics + result["statistics"].pop("cases_missing_data") # remove donor IDs from _map.json file ++ # per-donor completeness is keyed by donor ID: keep it out of _map.json too, ++ # but retain an aggregate tier/level summary (no IDs) in the statistics. ++ donor_completeness = result["statistics"].pop("donor_completeness", {}) ++ result["statistics"]["completeness_summary"] = summarize_completeness(donor_completeness) ++ # write a detailed per-donor completeness failure report (contains donor IDs, ++ # so it is kept out of _map.json, like the validation results) ++ if donor_completeness: ++ completeness_failures = build_completeness_failures( ++ donor_completeness, getattr(schema, "tier_criteria", None)) ++ with open(f"{input_path}_completeness_failures.json", 'w') as f: ++ json.dump(completeness_failures, f, indent=4) ++ print(f"{Bcolors.OKGREEN}Completeness failure report (" ++ f"{completeness_failures['failing_donors']}/{completeness_failures['total_donors']} " ++ f"donors) written to {input_path}_completeness_failures.json{Bcolors.ENDC}") + + # write ingestion and validation json files + print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}") +diff --git a/src/clinical_etl/completeness_table.py b/src/clinical_etl/completeness_table.py +index f315509..f356893 100644 +--- a/src/clinical_etl/completeness_table.py ++++ b/src/clinical_etl/completeness_table.py +@@ -27,7 +27,39 @@ def generate_csv(input_path): + out.write(f"{k},{field},{total},{missing},{round(missing_percent)}\n") + + ++def generate_donor_completeness_csv(input_path): ++ """Write a per-donor tier/level completeness table from a ++ *_validation_results.json file (which holds the donor-ID-keyed records).""" ++ output_path = 
input_path.replace("_validation_results.json", "_donor_completeness.csv") ++ print(f"Converting {input_path} to {output_path}") ++ with open(input_path) as f: ++ donors = json.load(f).get("donor_completeness", {}) ++ with open(output_path, "w") as out: ++ out.write("Donor,Tier,Level,Type,Minimal Complete,Fulsome Complete,Unmet (fulsome)\n") ++ for donor_id, rec in donors.items(): ++ out.write( ++ f"{donor_id},{rec['tier'] or ''},{rec['level']},{rec['type']}," ++ f"{rec['minimal_complete']},{rec['fulsome_complete']}," ++ f"{'|'.join(rec['fulsome_unmet'])}\n" ++ ) ++ ++ ++def main(input_path): ++ """Dispatch on file type: aggregate field stats from a _map.json, or the ++ per-donor tier/level table from a _validation_results.json.""" ++ with open(input_path) as f: ++ data = json.load(f) ++ if "donor_completeness" in data: ++ generate_donor_completeness_csv(input_path) ++ elif "statistics" in data: ++ generate_csv(input_path) ++ else: ++ raise SystemExit( ++ "Input json has neither 'statistics' (a _map.json) nor " ++ "'donor_completeness' (a _validation_results.json)." ++ ) ++ ++ + if __name__ == "__main__": + args = parse_args() +- input_path = args.input +- generate_csv(input_path) ++ main(args.input) +diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py +index 2b691a1..09e1fa9 100644 +--- a/src/clinical_etl/mohschemav3.py ++++ b/src/clinical_etl/mohschemav3.py +@@ -163,6 +163,64 @@ class MoHSchemaV3(BaseSchema): + } + } + ++ # ------------------------------------------------------------------ # ++ # Per-donor completeness criteria (consumed by BaseSchema engine) # ++ # ------------------------------------------------------------------ # ++ ++ # Tier = sample_registration composition. Ordered strongest-first so that ++ # a donor satisfying both is assigned the higher tier (A) and is therefore ++ # NOT also counted in the Tier B total. Criteria are cumulative: Tier A's ++ # required samples are a superset of Tier B's. 
++ tier_criteria = { ++ "A": {"tumour_dna": 1, "tumour_rna": 1, "normal_dna": 1}, ++ "B": {"tumour_dna": 1, "normal_dna": 1}, ++ } ++ ++ # Minimal completeness: reduced field set that must hold valid values on ++ # every existing instance of each object type. ++ minimal_criteria = { ++ "donors": ["gender", "sex_at_birth", "date_of_birth", "date_resolution"], ++ "primary_diagnoses": ["date_of_diagnosis", "cancer_type_code", "primary_site", "basis_of_diagnosis"], ++ "specimens": ["specimen_collection_date", "specimen_anatomic_location"], ++ "sample_registrations": ["specimen_tissue_source", "tumour_normal_designation", "specimen_type", "sample_type"], ++ } ++ ++ # Nested objects every donor must have for 'fulsome' completeness. Counted ++ # anywhere in the donor tree (e.g. treatments live under primary_diagnoses). ++ required_instances = [ ++ {"key": "treatments", "min": 1}, ++ ] ++ ++ # Conditionally-required fields are NOT re-listed here. 'fulsome' completeness ++ # is derived directly from the validation pass: every conditional requirement ++ # in the validate_* methods raises warn(..., conditional_required=True), and ++ # those warnings are attributed per-donor and fed into the fulsome check ++ # (see BaseSchema._evaluate_fulsome). Soft notes / consistency warnings are ++ # marked conditional_required=False so they don't affect completeness. ++ ++ @staticmethod ++ def _sample_kind(sample): ++ """Classify a sample_registration as e.g. 'tumour_dna' / 'normal_dna'. ++ ++ ASSUMPTION: the molecule (DNA vs RNA) is read from `sample_type`. ++ If the MoH model encodes it in a different field, change ONLY this ++ method (e.g. 
read 'specimen_type' or an analyte field instead).""" ++ designation = (sample.get("tumour_normal_designation") or "").lower() ++ sample_type = (sample.get("sample_type") or "").lower() ++ if "rna" in sample_type: ++ molecule = "rna" ++ elif "dna" in sample_type: ++ molecule = "dna" ++ else: ++ molecule = None ++ if "tumour" in designation or "tumor" in designation: ++ tn = "tumour" ++ elif "normal" in designation: ++ tn = "normal" ++ else: ++ tn = None ++ return f"{tn}_{molecule}" if (tn and molecule) else None ++ + def validate_donors(self, map_json): + for prop in map_json: + match prop: +@@ -181,7 +239,8 @@ class MoHSchemaV3(BaseSchema): + if map_json["lost_to_followup_reason"] is not None: + if "lost_to_followup_after_clinical_event_identifier" not in map_json: + self.warn( +- "lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted") ++ "lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted", ++ conditional_required=False) + case "date_alive_after_lost_to_followup": + if map_json["date_alive_after_lost_to_followup"] is not None: + if "lost_to_followup_after_clinical_event_identifier" not in map_json: +@@ -239,7 +298,7 @@ class MoHSchemaV3(BaseSchema): + if ('diagnosis_date' in locals() and diagnosis_date not in [None, ''] and + treatment_end not in [None, ''] and 'treatment_end' in locals() and + treatment_end < diagnosis_date): +- self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: date_of_diagnosis should be earlier than treatment_end_date ") ++ self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: date_of_diagnosis should be earlier than treatment_end_date ", conditional_required=False) + if 'treatment_start' in locals() and treatment_start not in [None, '']: + if 'death' in locals() and death not in [None, ''] and treatment_start > death: + 
self.fail( +@@ -247,12 +306,12 @@ class MoHSchemaV3(BaseSchema): + if 'birth' in locals() and birth not in [None, ''] and treatment_start < birth and treatment_start is not None: + self.fail(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: treatment_start_date cannot be before date_of_birth") + if 'diagnosis_date' in locals() and diagnosis_date not in [None, ''] and treatment_start < diagnosis_date: +- self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: treatment_start_date should not be before date_of_diagnosis") ++ self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: treatment_start_date should not be before date_of_diagnosis", conditional_required=False) + diagnosis_values_list = list(diagnoses_dates.values()) + if (len(diagnosis_values_list) > 0 and "int" in str(type(diagnosis_values_list[0])) and + 0 not in diagnosis_values_list): + self.warn(f"Earliest primary_diagnosis.date_of_diagnosis.month_interval should be 0, current " +- f"month_intervals: {diagnoses_dates}") ++ f"month_intervals: {diagnoses_dates}", conditional_required=False) + case "date_of_death": + if map_json["date_of_death"] is not None: + if map_json["is_deceased"] in ["No", "Not available"]: +@@ -286,7 +345,7 @@ class MoHSchemaV3(BaseSchema): + + def validate_primary_diagnoses(self, map_json): + if map_json["date_of_diagnosis"] is None: +- self.warn("NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis") ++ self.warn("NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis", conditional_required=False) + if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json: + self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required") + for prop in map_json: +diff --git a/src/clinical_etl/schema.py b/src/clinical_etl/schema.py 
+index 1609eb8..2ed0fc0 100644 +--- a/src/clinical_etl/schema.py ++++ b/src/clinical_etl/schema.py +@@ -49,6 +49,13 @@ class BaseSchema: + # The component name in the OpenAPI specification + schema_name = None + ++ # Values that count as "empty" for per-donor completeness scoring. ++ # NOTE: "Not available" is intentionally NOT included: it is treated as a ++ # valid, complete answer for completeness purposes. (This differs from the ++ # required_but_missing / cases_missing_data stats in validate_schema, which ++ # still treat "Not available" as missing.) ++ EMPTY_VALUES = (None, "") ++ + # schema for validation beyond jsonschema checks. Each schema that is described in the model gets an entry. + validation_schema = { + "examples": { # There should be a method `validate_examples` implemented to validate conditionals +@@ -115,7 +122,15 @@ class BaseSchema: + self.template = self.add_default_mappings(raw_template) + + +- def warn(self, message): ++ def warn(self, message, conditional_required=True): ++ """Record a validation warning. ++ ++ `conditional_required` (default True) marks the warning as indicating a ++ required or conditionally-required field/object that is missing, so it ++ counts against per-donor 'fulsome' completeness. Set it False for soft ++ notes and cross-field consistency warnings that are not about a missing ++ requirement. 
The warning is attributed to the current donor via ++ stack_location[0] so the completeness engine can look it up.""" + prefix = " > ".join(self.stack_location) + if prefix.strip() == "": + prefix = "" +@@ -123,6 +138,11 @@ class BaseSchema: + prefix += ": " + message = prefix + message + self.validation_warnings.append(f"{message}") ++ if conditional_required and self.stack_location: ++ donor = self.stack_location[0] ++ if not hasattr(self, "_conditional_gaps"): ++ self._conditional_gaps = {} ++ self._conditional_gaps.setdefault(donor, []).append(message) + + + def fail(self, message): +@@ -324,6 +344,8 @@ class BaseSchema: + self.statistics["required_but_missing"] = {} + self.statistics["schemas_used"] = [] + self.statistics["cases_missing_data"] = [] ++ self.statistics["donor_completeness"] = {} ++ self._conditional_gaps = {} # donor_id -> [conditional-requirement warnings] + + for key in self.validation_schema.keys(): + self.validation_schema[key]["extra_args"] = { +@@ -333,6 +355,9 @@ class BaseSchema: + for x in range(0, len(map_json[root_schema])): + self.validate_jsonschema(map_json[root_schema][x], x) + self.validate_schema(root_schema, map_json[root_schema][x]) ++ record = self.calculate_donor_completeness(map_json[root_schema][x]) ++ if record is not None: ++ self.statistics["donor_completeness"][record["donor_id"]] = record + for schema in self.identifiers: + most_common = self.identifiers[schema].most_common() + if most_common[0][1] > 1: +@@ -411,7 +436,10 @@ class BaseSchema: + } + self.statistics["required_but_missing"][schema_name][f]["total"] += 1 + if f not in map_json or map_json[f] == "Not available": +- self.warn(f"{f} required for {schema_name}") ++ # Flat required-field gaps are handled by the completeness ++ # engine's _required_complete (which, unlike this check, treats ++ # "Not available" as a valid value), so don't double-count here. 
++ self.warn(f"{f} required for {schema_name}", conditional_required=False) + self.statistics["required_but_missing"][schema_name][f]["missing"] += 1 + if case not in self.statistics["cases_missing_data"]: + self.statistics["cases_missing_data"].append(case) +@@ -432,3 +460,145 @@ class BaseSchema: + else: + self.validate_schema(ns, map_json[ns]) + self.stack_location.pop() ++ ++ # ------------------------------------------------------------------ # ++ # Per-donor completeness # ++ # ------------------------------------------------------------------ # ++ # Two orthogonal axes per donor: ++ # * tier ("A"/"B"/None) -- driven by sample_registration composition ++ # * level ("fulsome"/"minimal"/"incomplete") -- driven by field validity ++ # A schema subclass opts in by defining `tier_criteria`, `minimal_criteria` ++ # and (optionally) `required_instances` plus the `_sample_kind` classifier. ++ # Schemas that don't define these get None (feature disabled for them). ++ ++ def _field_present(self, obj, field): ++ """True if `field` has a non-empty value on `obj`. ++ ++ "Not available" counts as a valid, complete value (see EMPTY_VALUES).""" ++ return isinstance(obj, dict) and field in obj and obj[field] not in self.EMPTY_VALUES ++ ++ def _find_objects(self, node, key): ++ """Return every object appearing under `key` anywhere in the donor tree.""" ++ found = [] ++ if isinstance(node, dict): ++ for k, v in node.items(): ++ if k == key: ++ found.extend(v if isinstance(v, list) else [v]) ++ found.extend(self._find_objects(v, key)) ++ elif isinstance(node, list): ++ for item in node: ++ found.extend(self._find_objects(item, key)) ++ return [o for o in found if isinstance(o, dict)] ++ ++ def _evaluate_tier(self, donor): ++ """Classify a donor's sample composition into a single, exclusive tier. ++ ++ Tier criteria are cumulative (Tier A's samples are a superset of Tier B's), ++ so a donor that qualifies for A also qualifies for B. 
The returned `tier` ++ resolves this in favour of the highest tier, so a Tier A donor is counted ++ ONLY as A and never toward the Tier B total. The `criteria_met` dict is ++ diagnostic (overlapping) and must not be used for tallying totals.""" ++ samples = self._find_objects(donor, "sample_registrations") ++ counts = {} ++ for s in samples: ++ kind = self._sample_kind(s) ++ if kind: ++ counts[kind] = counts.get(kind, 0) + 1 ++ criteria_met = { ++ tier: all(counts.get(k, 0) >= n for k, n in req.items()) ++ for tier, req in self.tier_criteria.items() ++ } ++ # highest satisfied tier wins; assumes tier_criteria ordered strongest-first ++ tier = next((t for t in self.tier_criteria if criteria_met.get(t)), None) ++ return tier, counts, criteria_met ++ ++ def _evaluate_minimal(self, donor): ++ """Check the reduced 'minimal' field set on every existing instance.""" ++ unmet = [] ++ for schema_name, fields in self.minimal_criteria.items(): ++ instances = [donor] if schema_name == self._root_schema() \ ++ else self._find_objects(donor, schema_name) ++ id_key = self.validation_schema.get(schema_name, {}).get("id") ++ for inst in instances: ++ ident = inst.get(id_key, "?") if id_key else "?" ++ unmet += [f"{schema_name}[{ident}].{f}" ++ for f in fields if not self._field_present(inst, f)] ++ return (len(unmet) == 0), unmet ++ ++ def _required_complete(self, schema_name, obj, unmet, prefix=""): ++ """Recursively check all required_fields across the donor tree.""" ++ spec = self.validation_schema[schema_name] ++ id_key = spec["id"] ++ ident = obj.get(id_key, "?") if id_key else "?" ++ here = f"{prefix}{schema_name}[{ident}]" ++ for f in spec["required_fields"]: ++ if not self._field_present(obj, f): ++ unmet.append(f"{here}.{f}") ++ for ns in spec["nested_schemas"]: ++ for child in (obj.get(ns) or []): ++ self._required_complete(ns, child, unmet, prefix=f"{here} > ") ++ ++ def _evaluate_required_instances(self, donor): ++ """Check that required nested objects exist (e.g. 
>= 1 treatment). ++ ++ Driven by the optional `required_instances` list on the schema subclass, ++ each entry being {"key": OBJECT_KEY, "min": MIN_COUNT}. Objects are counted ++ anywhere in the donor tree via _find_objects.""" ++ unmet = [] ++ for spec in getattr(self, "required_instances", []): ++ found = len(self._find_objects(donor, spec["key"])) ++ need = spec.get("min", 1) ++ if found < need: ++ unmet.append( ++ f"missing required object: {spec['key']} (found {found}, need >= {need})") ++ return unmet ++ ++ def _evaluate_fulsome(self, donor, donor_id): ++ """Fulsome = every required field present (across the whole tree) AND ++ every conditionally-required field/object present. ++ ++ Flat required fields are checked directly by _required_complete (which ++ honours "Not available" as a valid value). The conditional requirements ++ are taken from the validation pass itself: every `warn(...)` raised with ++ conditional_required=True during this donor's validation is a missing ++ conditional requirement. This means *all* conditional rules in the ++ validate_* methods are covered automatically and stay in sync as the ++ model evolves -- no rule needs to be re-listed here. 
++ ++ NOTE: relies on validate_schema having run for this donor first (it does, ++ in validate_ingest_map, immediately before calculate_donor_completeness).""" ++ unmet = [] ++ self._required_complete(self._root_schema(), donor, unmet) ++ unmet += getattr(self, "_conditional_gaps", {}).get(donor_id, []) ++ unmet += self._evaluate_required_instances(donor) ++ return (len(unmet) == 0), unmet ++ ++ def _root_schema(self): ++ return list(self.validation_schema.keys())[0] ++ ++ def calculate_donor_completeness(self, donor): ++ """Return a per-donor completeness record, or None if this schema does ++ not define completeness criteria.""" ++ if getattr(self, "tier_criteria", None) is None \ ++ or getattr(self, "minimal_criteria", None) is None: ++ return None ++ ++ id_field = self.validation_schema[self._root_schema()]["id"] ++ donor_id = donor.get(id_field) ++ tier, sample_counts, tier_criteria_met = self._evaluate_tier(donor) ++ minimal_ok, minimal_unmet = self._evaluate_minimal(donor) ++ # conditional gaps are keyed by stack_location[0] == str(donor_id) ++ fulsome_ok, fulsome_unmet = self._evaluate_fulsome(donor, str(donor_id)) ++ level = "fulsome" if fulsome_ok else "minimal" if minimal_ok else "incomplete" ++ return { ++ "donor_id": donor_id, ++ "tier": tier, # "A" / "B" / None (exclusive) ++ "level": level, # fulsome / minimal / incomplete ++ "type": (f"Tier {tier} {level}" if tier else f"untiered {level}"), ++ "tier_criteria_met": tier_criteria_met, # diagnostic only (overlapping) ++ "sample_counts": sample_counts, ++ "minimal_complete": minimal_ok, ++ "fulsome_complete": fulsome_ok, ++ "minimal_unmet": minimal_unmet, ++ "fulsome_unmet": fulsome_unmet, ++ } +diff --git a/tests/raw_data/Biomarker.csv b/tests/raw_data/Biomarker.csv +index 3d1fa75..2a70823 100644 +--- a/tests/raw_data/Biomarker.csv ++++ b/tests/raw_data/Biomarker.csv +@@ -11,3 +11,5 @@ DONOR_3,,,TR_3,1/5/2020,,7,327,103,8,Positive,65.8,Not applicable,23.6,Not avail + 
DONOR_3,,,TR_3,1/5/2020,,7,207,112,9,Positive,73.5,Not available,72.8,Cannot be determined,Not applicable,Not applicable,Negative,, + DONOR_3,,PD_3,,1/5/2020,,6,304,-99,9,,1.3,Negative,15.1,Not available,Not applicable,Not applicable,Positive,HPV16|HPV39, + DONOR_5,,PD_5,,1/5/2020,,4,245,46,11,Cannot be determined,59.9,Not available,-99,Not applicable,Cannot be determined,Negative,Cannot be determined,, ++CMPLT_COV1,,,,1/5/2018,,,5,,,,,,,,,,,, ++CMPLT_COV2,,,,1/5/2018,,,5,,,,,,,,,,,, +diff --git a/tests/raw_data/Comorbidity.csv b/tests/raw_data/Comorbidity.csv +index 08e14a3..07c5c30 100644 +--- a/tests/raw_data/Comorbidity.csv ++++ b/tests/raw_data/Comorbidity.csv +@@ -1,2 +1,4 @@ + submitter_donor_id,prior_malignancy,laterality_of_prior_malignancy,age_at_comorbidity_diagnosis,comorbidity_type_code,comorbidity_treatment_status,comorbidity_treatment, + DONOR_1,Yes,Right,44,C34.9,Not available,Ablation, ++CMPLT_COV1,,,,C34.9,,, ++CMPLT_COV2,,,,C34.9,,, +diff --git a/tests/raw_data/Donor.csv b/tests/raw_data/Donor.csv +index 7fdbde0..4a6b3af 100644 +--- a/tests/raw_data/Donor.csv ++++ b/tests/raw_data/Donor.csv +@@ -5,3 +5,10 @@ DONOR_3,TEST_1,PD_3,Lost contact,4/6/2022,No,,7/12/1945,,Non-binary,Other,month + DONOR_4,TEST_1,,,,Yes,Not available,1/6/1984,239,Man,Male,month + DONOR_5,TEST_2,PD_5,Not available,1/6/2022,Yes,,15/2/1984,,Woman,Female,month + DONOR_6,TEST_2,PD_6,Withdrew from study,1/6/2022,No,,12/9/1974,,Non-binary,Other,month ++CMPLT_AF,TEST_1,,,,No,,6/1/1960,,Woman,Female,month ++CMPLT_BF,TEST_1,,,,No,,6/1/1961,,Man,Male,month ++CMPLT_AM,TEST_1,,,,No,,6/1/1962,,Woman,Female,month ++CMPLT_BM,TEST_1,,,,No,,6/1/1963,,Man,Male,month ++CMPLT_INC,TEST_1,,,,No,,6/1/1964,,Non-binary,Other,month ++CMPLT_COV1,TEST_1,,,,No,,1/1/1970,,Woman,Female,month ++CMPLT_COV2,TEST_1,,,,No,,1/1/1970,,Woman,Female,month +diff --git a/tests/raw_data/Exposure.csv b/tests/raw_data/Exposure.csv +new file mode 100644 +index 0000000..f9832ca +--- /dev/null ++++ 
b/tests/raw_data/Exposure.csv +@@ -0,0 +1,3 @@ ++submitter_donor_id,tobacco_smoking_status,tobacco_type,pack_years_smoked ++CMPLT_COV1,Lifelong non-smoker (<100 cigarettes smoked in lifetime),, ++CMPLT_COV2,Lifelong non-smoker (<100 cigarettes smoked in lifetime),, +diff --git a/tests/raw_data/Followup.csv b/tests/raw_data/Followup.csv +index 2a946bb..c636ba4 100644 +--- a/tests/raw_data/Followup.csv ++++ b/tests/raw_data/Followup.csv +@@ -5,3 +5,5 @@ FOLLOW_UP_3,DONOR_1,,,01/08/2022,Loco-regional progression,Distant recurrence/me + FOLLOW_UP_4,DONOR_1,,,01/08/2022,Loco-regional progression,Biochemical progression,16-05-2022,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS, + FOLLOW_UP_4,DONOR_6,,,01/07/2022,Loco-regional progression,Biochemical progression,16-05-2022,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS, + DUPLICATE_ID,DONOR_4,,,01/07/2022,Loco-regional progression,Biochemical progression,18-05-2022,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS, ++FU_CMPLT_COV1,CMPLT_COV1,,,1/6/2019,No evidence of disease,,,,,,,,,, ++FU_CMPLT_COV2,CMPLT_COV2,,,1/6/2019,No evidence of disease,,,,,,,,,, +diff --git a/tests/raw_data/PrimaryDiagnosis.csv b/tests/raw_data/PrimaryDiagnosis.csv +index d74a45d..ec168f8 100644 +--- a/tests/raw_data/PrimaryDiagnosis.csv ++++ b/tests/raw_data/PrimaryDiagnosis.csv +@@ -6,4 +6,11 @@ DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Cytology,AJCC cancer staging system,T + DONOR_4,Brain,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS + DONOR_5,Gum,PD_5,15/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES + DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2016,C02.2,Specific tumour markers,International 
Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB +-DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,, +\ No newline at end of file ++DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,, ++CMPLT_AF,Breast,PD_AF,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, ++CMPLT_BF,Breast,PD_BF,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, ++CMPLT_AM,Breast,PD_AM,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, ++CMPLT_BM,Breast,PD_BM,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, ++CMPLT_INC,Breast,PD_INC,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, ++CMPLT_COV1,Breast,PD_COV1,1/2/2018,C50.1,Histology of a primary tumour,AJCC cancer staging system,T1,N0,M0,,Left,,,, ++CMPLT_COV2,Breast,PD_COV2,1/2/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, +diff --git a/tests/raw_data/Radiation.csv b/tests/raw_data/Radiation.csv +index e253d36..14aa591 100644 +--- a/tests/raw_data/Radiation.csv ++++ b/tests/raw_data/Radiation.csv +@@ -1,3 +1,5 @@ + submitter_donor_id, submitter_treatment_id, radiation_therapy_modality, radiation_therapy_type, radiation_therapy_fractions, radiation_therapy_dosage, anatomical_site_irradiated, radiation_boost, reference_radiation_treatment_id, + DONOR_5,TR_5, Teleradiotherapy protons (procedure), Internal, 30,-99,FINGER (INCLUDING THUMBS),Yes, REFERENCE_RADIATION_TREATMENT_2, + DONOR_5,TR_5, Teleradiotherapy protons (procedure), Internal, 10,33,FINGER (INCLUDING THUMBS),No,, ++CMPLT_COV1,TR_COV1,Brachytherapy (procedure),External,30,50,ABDOMEN,No,, ++CMPLT_COV2,TR_COV2,Brachytherapy 
(procedure),External,30,50,ABDOMEN,No,, +diff --git a/tests/raw_data/Sample_Registration.csv b/tests/raw_data/Sample_Registration.csv +index f77fa14..c9d9157 100644 +--- a/tests/raw_data/Sample_Registration.csv ++++ b/tests/raw_data/Sample_Registration.csv +@@ -3,3 +3,19 @@ SAMPLE_REGISTRATION_1,DONOR_2,SPECIMEN_4,Cervical mucus,Tumour,Recurrent tumour, + SAMPLE_REGISTRATION_2,DONOR_2,SPECIMEN_7,Cervical mucus,Normal,Recurrent tumour,Total DNA,Bar + SAMPLE_REGISTRATION_3,DONOR_2,SPECIMEN_5,Cervical mucus,Normal,Recurrent tumour,Total DNA,Baz + SAMPLE_REGISTRATION_4,DONOR_5,SPECIMEN_6,Cervical mucus,Normal,Recurrent tumour,Total DNA,Bat ++SAMP_AF_TD,CMPLT_AF,SPEC_AF_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo ++SAMP_AF_TR,CMPLT_AF,SPEC_AF_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total RNA,Foo ++SAMP_AF_ND,CMPLT_AF,SPEC_AF_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo ++SAMP_BF_TD,CMPLT_BF,SPEC_BF_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo ++SAMP_BF_ND,CMPLT_BF,SPEC_BF_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo ++SAMP_AM_TD,CMPLT_AM,SPEC_AM_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo ++SAMP_AM_TR,CMPLT_AM,SPEC_AM_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total RNA,Foo ++SAMP_AM_ND,CMPLT_AM,SPEC_AM_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo ++SAMP_BM_TD,CMPLT_BM,SPEC_BM_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo ++SAMP_BM_ND,CMPLT_BM,SPEC_BM_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo ++SAMP_INC_ND,CMPLT_INC,SPEC_INC_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo ++SAMP_COV1_TD,CMPLT_COV1,SPEC_COV1_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo ++SAMP_COV1_TR,CMPLT_COV1,SPEC_COV1_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total RNA,Foo ++SAMP_COV1_ND,CMPLT_COV1,SPEC_COV1_N,Blood derived - peripheral 
blood,Normal,Normal,Total DNA,Foo ++SAMP_COV2_TD,CMPLT_COV2,SPEC_COV2_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo ++SAMP_COV2_ND,CMPLT_COV2,SPEC_COV2_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +diff --git a/tests/raw_data/Specimen.csv b/tests/raw_data/Specimen.csv +index b208747..1b11340 100644 +--- a/tests/raw_data/Specimen.csv ++++ b/tests/raw_data/Specimen.csv +@@ -6,3 +6,16 @@ DONOR_2,PD_2_1,SPECIMEN_4,,Durie-Salmon staging system,23/12/2021,RNA later froz + DONOR_2,PD_2,SPECIMEN_5,TR_7,Durie-Salmon staging system,07/12/2020,Frozen in -70 freezer,,C15.9,,,,,,,Formalin fixed & paraffin embedded,,,,,,,,,,,,,,, + DONOR_5,PD_5,SPECIMEN_6,,Durie-Salmon staging system,20/04/2021,Cut slide,8124/9,C15.9,,Not done,IASLC grading system,G3,51-100%,Pathology estimate by percent nuclei,Formalin fixed - buffered,,,,,,,,,,,,,,, + DONOR_2,PD_2_1,SPECIMEN_7,,Durie-Salmon staging system,23/02/2021,RNA later frozen,,C43.9,,,,,,,Cryopreservation - other,,,,,,,,,,,,,,, ++CMPLT_AF,PD_AF,SPEC_AF_T,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, ++CMPLT_AF,PD_AF,SPEC_AF_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, ++CMPLT_BF,PD_BF,SPEC_BF_T,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, ++CMPLT_BF,PD_BF,SPEC_BF_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, ++CMPLT_AM,PD_AM,SPEC_AM_T,,,1/8/2018,,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, ++CMPLT_AM,PD_AM,SPEC_AM_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, ++CMPLT_BM,PD_BM,SPEC_BM_T,,,1/8/2018,,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, ++CMPLT_BM,PD_BM,SPEC_BM_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, ++CMPLT_INC,PD_INC,SPEC_INC_N,,,1/8/2018,Frozen in 
liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, ++CMPLT_COV1,PD_COV1,SPEC_COV1_T,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, ++CMPLT_COV1,PD_COV1,SPEC_COV1_N,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, ++CMPLT_COV2,PD_COV2,SPEC_COV2_T,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, ++CMPLT_COV2,PD_COV2,SPEC_COV2_N,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +diff --git a/tests/raw_data/Surgery.csv b/tests/raw_data/Surgery.csv +index ee22ddd..95b8ac8 100644 +--- a/tests/raw_data/Surgery.csv ++++ b/tests/raw_data/Surgery.csv +@@ -1,3 +1,5 @@ + submitter_donor_id,submitter_specimen_id,submitter_treatment_id,surgery_reference_database,surgery_reference_identifier,surgery_type,surgery_site,surgery_location,tumour_length,tumour_width,greatest_dimension_tumour,tumour_focality,residual_tumour_classification,margin_types_involved,margin_types_not_involved,lymphovascular_invasion,margin_types_not_assessed,perineural_invasion + DONOR_2,SPECIMEN_4,TR_7,SNOMED,178294003,Axillary lymph nodes sampling,C14,Primary,9,7,5,Unifocal,R2,Distal margin|Circumferential resection margin,,Absent,Not available,Absent + DONOR_6,SPECIMEN_43,TR_9,NCIt,C15361,Fine needle aspiration biopsy,C14,Primary,9,7,5,Unifocal,R2,Distal margin|Circumferential resection margin,,Absent,Not available,Absent ++CMPLT_COV1,SPEC_COV1_T,TR_COV1,SNOMED,178294003,Excision,,,,,,,,,,,, ++CMPLT_COV2,SPEC_COV2_T,TR_COV2,SNOMED,178294003,Excision,,,,,,,,,,,, +diff --git a/tests/raw_data/SystemicTherapy.csv b/tests/raw_data/SystemicTherapy.csv +index 7f0909c..8a1dfc4 100644 +--- a/tests/raw_data/SystemicTherapy.csv ++++ b/tests/raw_data/SystemicTherapy.csv +@@ -4,3 +4,5 @@ DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87333,mg/m2,150,111,PubChem,5,2,1/04/2020,1/ + DONOR_3,TR_3,Hormone 
therapy,degarelix,46475,ug/m2,179,97,PubChem,6,3,10/12/2020,19/12/2021 + DONOR_4,TR_4,Immunotherapy,Pembrolizumab,4459876,IU/kg,95,160,RxNorm,4,2,1/3/2021,12/12/2021 + DONOR_2,TR_8,Immunotherapy,Pexidartinib,8836851,ug/m2,197,183,PubChem,6,1,9/5/2021,6/6/2023 ++CMPLT_COV1,TR_COV1,Chemotherapy,Cisplatin,12345,,,,RxNorm,,,1/3/2018,1/6/2018 ++CMPLT_COV2,TR_COV2,Chemotherapy,Cisplatin,12345,,,,RxNorm,,,1/3/2018,1/6/2018 +diff --git a/tests/raw_data/Treatment.csv b/tests/raw_data/Treatment.csv +index 6b5d854..da6e6c7 100644 +--- a/tests/raw_data/Treatment.csv ++++ b/tests/raw_data/Treatment.csv +@@ -9,3 +9,7 @@ TR_7,DONOR_2,PD_2_1,Surgery,Yes,01/02/2021,01/02/2022,Diagnostic,,Progressive di + TR_8,DONOR_2,PD_2_1,Systemic therapy,No,01/03/2021,01/03/2022,Forensic,AML Response Criteria,Immune confirmed progressive disease (iCPD),Other + TR_9,DONOR_6,PD_6,Surgery,No,01/02/2021,01/02/2022,Diagnostic,Blazer score,Progressive disease, + TR_10,DONOR_5,PD_5,Systemic therapy,No,01/02/2021,01/02/2022,Forensic,Response Assessment in Neuro-Oncology (RANO),, ++TR_AF,CMPLT_AF,PD_AF,Bone marrow transplant,Yes,1/7/2018,1/9/2018,Curative,,, ++TR_BF,CMPLT_BF,PD_BF,Bone marrow transplant,Yes,1/7/2018,1/9/2018,Curative,,, ++TR_COV1,CMPLT_COV1,PD_COV1,Systemic therapy|Radiation therapy|Surgery,Yes,1/2/2018,1/12/2018,Curative,,, ++TR_COV2,CMPLT_COV2,PD_COV2,Systemic therapy|Radiation therapy|Surgery,Yes,1/2/2018,1/12/2018,Curative,,, +diff --git a/tests/test2mohv3.csv b/tests/test2mohv3.csv +index 0a1658e..49a314d 100644 +--- a/tests/test2mohv3.csv ++++ b/tests/test2mohv3.csv +@@ -132,11 +132,11 @@ DONOR.INDEX.comorbidities.INDEX.age_at_comorbidity_diagnosis_not_available, {num + DONOR.INDEX.comorbidities.INDEX.comorbidity_type_code, {single_val(Comorbidity.comorbidity_type_code)} + DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment_status, {single_val(Comorbidity.comorbidity_treatment_status)} + DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment, 
{single_val(Comorbidity.comorbidity_treatment)} +-DONOR.INDEX.exposures.INDEX, {indexed_on(EXPOSURES_SHEET.submitter_donor_id)} +-DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(EXPOSURES_SHEET.tobacco_smoking_status)} +-DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(EXPOSURES_SHEET.tobacco_type)} +-DONOR.INDEX.exposures.INDEX.pack_years_smoked, {set_neg_99_blank_int(EXPOSURES_SHEET.pack_years_smoked)} +-DONOR.INDEX.exposures.INDEX.pack_years_smoked_not_available, {numeric_not_available(EXPOSURES_SHEET.pack_years_smoked)} ++DONOR.INDEX.exposures.INDEX, {indexed_on(Exposure.submitter_donor_id)} ++DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(Exposure.tobacco_smoking_status)} ++DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(Exposure.tobacco_type)} ++DONOR.INDEX.exposures.INDEX.pack_years_smoked, {set_neg_99_blank_int(Exposure.pack_years_smoked)} ++DONOR.INDEX.exposures.INDEX.pack_years_smoked_not_available, {numeric_not_available(Exposure.pack_years_smoked)} + DONOR.INDEX.biomarkers.INDEX, {indexed_on(Biomarker.submitter_donor_id)} + DONOR.INDEX.biomarkers.INDEX.submitter_specimen_id, {single_val(Biomarker.submitter_specimen_id)} + DONOR.INDEX.biomarkers.INDEX.submitter_primary_diagnosis_id, {single_val(Biomarker.submitter_primary_diagnosis_id)} +diff --git a/tests/test_data_ingest.py b/tests/test_data_ingest.py +index ae24b28..4863325 100644 +--- a/tests/test_data_ingest.py ++++ b/tests/test_data_ingest.py +@@ -31,8 +31,9 @@ def packets(): + + + def test_csv_convert(packets): +- # there are 6 donors +- assert len(packets) == 6 ++ # 6 original sample donors + 5 completeness fixtures (CMPLT_AF/BF/AM/BM/INC) ++ # + 2 full-coverage fulsome donors (CMPLT_COV1/COV2) ++ assert len(packets) == 13 + + + def test_external_mapping(packets): +@@ -79,7 +80,11 @@ def test_donor_2(packets): + + + def test_validation(packets, schema): +- schema.validate_ingest_map({"donors": packets}) ++ # Scope validation to the original sample 
donors so the expected warning / ++ # error lists below are unaffected by the CMPLT_* completeness fixtures. ++ original_ids = {"DONOR_1", "DONOR_2", "DONOR_3", "DONOR_4", "DONOR_5", "DONOR_6"} ++ original = [p for p in packets if p["submitter_donor_id"] in original_ids] ++ schema.validate_ingest_map({"donors": original}) + print(schema.validation_warnings) + warnings = [ + "DONOR_2 > PD_2: date_of_diagnosis required for primary_diagnoses", +@@ -144,3 +149,52 @@ def test_multisheet_mapping(packets): + assert len(s["multisheet"]["placeholder"]["submitter_specimen_id"]["Sample_Registration"]) == 0 + assert len(s["multisheet"]["placeholder"]["extra"]["Sample_Registration"]) == 0 + ++ ++# Per-donor tier/level completeness summary over the full cohort. ++# The tests/raw_data fixtures include five CMPLT_* donors purpose-built to land ++# in each summary bucket: ++# CMPLT_AF -> Tier A, fulsome CMPLT_BF -> Tier B, fulsome ++# CMPLT_AM -> Tier A, minimal CMPLT_BM -> Tier B, minimal ++# CMPLT_INC -> untiered (single normal DNA sample) -> incomplete ++def test_completeness_summary(packets, schema): ++ schema.validate_ingest_map({"donors": packets}) ++ summary = CSVConvert.summarize_completeness(schema.statistics["donor_completeness"]) ++ ++ assert summary["total_donors"] == 13 ++ # each axis partitions all donors exactly once ++ assert (summary["tier_a_min_clinical_complete"] ++ + summary["tier_b_min_clinical_complete"] ++ + summary["incomplete_min_donors"]) == 13 ++ assert (summary["tier_a_full_clinical_complete"] ++ + summary["tier_b_full_clinical_complete"] ++ + summary["incomplete_full_donors"]) == 13 ++ # the CMPLT_* donors populate each category (Tier A donors are not also ++ # counted toward Tier B); original donors only add to the incomplete buckets ++ assert summary["tier_a_min_clinical_complete"] == 3 # CMPLT_AF, CMPLT_AM, CMPLT_COV1 ++ assert summary["tier_b_min_clinical_complete"] == 3 # CMPLT_BF, CMPLT_BM, CMPLT_COV2 ++ assert 
summary["tier_a_full_clinical_complete"] == 2 # CMPLT_AF, CMPLT_COV1 ++ assert summary["tier_b_full_clinical_complete"] == 2 # CMPLT_BF, CMPLT_COV2 ++ assert summary["incomplete_min_donors"] >= 1 # CMPLT_INC (+ originals) ++ assert summary["incomplete_full_donors"] >= 3 # CMPLT_AM, CMPLT_BM, CMPLT_INC (+ originals) ++ ++ ++# CMPLT_COV1 / CMPLT_COV2 populate every object type in the model, with all ++# required and conditionally-required fields filled, so they should come out ++# fulsome complete. This guards against the required-field lists drifting out of ++# sync with the model (a newly-required field would make these donors fail). ++def test_full_object_coverage_donors_are_fulsome(packets, schema): ++ schema.validate_ingest_map({"donors": packets}) ++ dc = schema.statistics["donor_completeness"] ++ for donor_id in ("CMPLT_COV1", "CMPLT_COV2"): ++ assert dc[donor_id]["fulsome_complete"] is True, dc[donor_id]["fulsome_unmet"] ++ assert dc[donor_id]["fulsome_unmet"] == [] ++ ++ cov = next(p for p in packets if p["submitter_donor_id"] == "CMPLT_COV1") ++ # donor-level objects ++ for key in ("primary_diagnoses", "followups", "biomarkers", "comorbidities", "exposures"): ++ assert cov.get(key), f"CMPLT_COV1 missing {key}" ++ pd = cov["primary_diagnoses"][0] ++ assert pd.get("specimens") and pd["specimens"][0].get("sample_registrations") ++ tr = pd["treatments"][0] ++ for key in ("systemic_therapies", "radiations", "surgeries"): ++ assert tr.get(key), f"CMPLT_COV1 treatment missing {key}" +diff --git a/tests/test_donor_completeness.py b/tests/test_donor_completeness.py +new file mode 100644 +index 0000000..76873f7 +--- /dev/null ++++ b/tests/test_donor_completeness.py +@@ -0,0 +1,271 @@ ++"""Tests for per-donor tier/level completeness (BaseSchema completeness engine). ++ ++Offline by design: MoHSchemaV3.__init__ fetches the OpenAPI schema over the ++network, but the completeness engine only needs the class-level criteria and ++the validation pass. 
We instantiate via __new__ and supply a permissive ++json_schema ({} validates anything) so validate_ingest_map runs without network. ++ ++'fulsome' completeness is derived from the validation pass, so these tests run ++the donor(s) through schema.validate_ingest_map and read the resulting ++statistics["donor_completeness"], rather than calling the engine in isolation. ++""" ++ ++import pytest ++ ++from clinical_etl.mohschemav3 import MoHSchemaV3 ++from clinical_etl.CSVConvert import summarize_completeness, build_completeness_failures ++ ++ ++@pytest.fixture ++def schema(): ++ s = MoHSchemaV3.__new__(MoHSchemaV3) # bypass network __init__ ++ s.validation_warnings = [] ++ s.validation_errors = [] ++ s.statistics = {} ++ s.identifiers = {} ++ s.stack_location = [] ++ s.json_schema = {} # permissive: no jsonschema errors ++ return s ++ ++ ++def evaluate(schema, *donors): ++ """Run donors through the full validation pass and return the per-donor ++ completeness records keyed by donor id.""" ++ schema.validate_ingest_map({"donors": list(donors)}) ++ return schema.statistics["donor_completeness"] ++ ++ ++# --- fixture builders ------------------------------------------------------ # ++ ++def sample(tn, stype, sid): ++ return { ++ "submitter_sample_id": sid, ++ "tumour_normal_designation": tn, ++ "specimen_tissue_source": "Blood derived", ++ "specimen_type": "Primary tumour", ++ "sample_type": stype, ++ } ++ ++ ++def tumour_dna(sid="S_TDNA"): ++ return sample("Tumour", "Total DNA", sid) ++ ++ ++def tumour_rna(sid="S_TRNA"): ++ return sample("Tumour", "Total RNA", sid) ++ ++ ++def normal_dna(sid="S_NDNA"): ++ return sample("Normal", "Total DNA", sid) ++ ++ ++def treatment(): ++ # treatment_type that does not require nested therapy/radiation/surgery objects ++ return { ++ "submitter_treatment_id": "TR1", ++ "treatment_type": ["Bone marrow transplant"], ++ "is_primary_treatment": "Yes", ++ "treatment_start_date": {"month_interval": 0}, ++ "treatment_end_date": 
{"month_interval": 1}, ++ "treatment_intent": "Curative", ++ } ++ ++ ++def build_donor(donor_id="DONOR", samples=None, deceased="No", ++ with_specimen_storage=True, with_staging=True, ++ with_tumour_specimen_fields=True, with_treatment=True): ++ """Build a donor that is fully (fulsome) complete by default; flip a knob ++ to introduce a specific gap.""" ++ if samples is None: ++ samples = [tumour_dna(), tumour_rna(), normal_dna()] ++ specimen = { ++ "submitter_specimen_id": "SP1", ++ "specimen_collection_date": {"month_interval": 0}, ++ "specimen_anatomic_location": "C50", ++ "sample_registrations": samples, ++ } ++ if with_specimen_storage: ++ specimen["specimen_storage"] = "Frozen in liquid nitrogen" ++ if with_tumour_specimen_fields: ++ specimen.update({ ++ "reference_pathology_confirmed_diagnosis": "Yes", ++ "reference_pathology_confirmed_tumour_presence": "Yes", ++ "tumour_grading_system": "Two-tier grading system", ++ "tumour_grade": "Low grade", ++ "percent_tumour_cells_range": "51-100%", ++ "percent_tumour_cells_measurement_method": "Image analysis", ++ }) ++ primary_diagnosis = { ++ "submitter_primary_diagnosis_id": "PD1", ++ "date_of_diagnosis": {"month_interval": 0}, ++ "cancer_type_code": "C50.1", ++ "primary_site": "Breast", ++ "basis_of_diagnosis": "Histology of primary tumour", ++ "specimens": [specimen], ++ } ++ if with_treatment: ++ primary_diagnosis["treatments"] = [treatment()] ++ if with_staging: ++ primary_diagnosis["clinical_tumour_staging_system"] = "Revised International staging system (R-ISS)" ++ primary_diagnosis["clinical_stage_group"] = "Stage I" ++ return { ++ "submitter_donor_id": donor_id, ++ "gender": "Woman", ++ "sex_at_birth": "Female", ++ "date_of_birth": {"month_interval": 0}, ++ "date_resolution": "month", ++ "is_deceased": deceased, ++ "program_id": "PROGRAM_1", ++ "primary_diagnoses": [primary_diagnosis], ++ } ++ ++ ++# --- _field_present: "Not available" counts as complete -------------------- # ++ ++def 
test_not_available_is_a_valid_value(schema): ++ assert schema._field_present({"x": "Not available"}, "x") is True ++ assert schema._field_present({"x": "Woman"}, "x") is True ++ assert schema._field_present({"x": ""}, "x") is False ++ assert schema._field_present({"x": None}, "x") is False ++ assert schema._field_present({}, "x") is False ++ ++ ++# --- tier classification (exclusive) --------------------------------------- # ++ ++def test_tier_a(schema): ++ rec = evaluate(schema, build_donor())["DONOR"] ++ assert rec["tier"] == "A" ++ assert rec["sample_counts"] == {"tumour_dna": 1, "tumour_rna": 1, "normal_dna": 1} ++ assert rec["tier_criteria_met"] == {"A": True, "B": True} # diagnostic overlap only ++ ++ ++def test_tier_b(schema): ++ rec = evaluate(schema, build_donor(samples=[tumour_dna(), normal_dna()]))["DONOR"] ++ assert rec["tier"] == "B" ++ assert rec["tier_criteria_met"] == {"A": False, "B": True} ++ ++ ++def test_tier_none_when_composition_incomplete(schema): ++ rec = evaluate(schema, build_donor(samples=[tumour_dna()]))["DONOR"] ++ assert rec["tier"] is None ++ ++ ++def test_summary_buckets(schema): ++ recs = evaluate( ++ schema, ++ # Tier A, fulsome ++ build_donor(donor_id="DONOR_AF"), ++ # Tier B, fulsome ++ build_donor(donor_id="DONOR_BF", samples=[tumour_dna(), normal_dna()]), ++ # Tier A, minimal only (missing conditional staging -> not fulsome) ++ build_donor(donor_id="DONOR_AM", with_staging=False), ++ # No qualifying tier (single tumour DNA sample) ++ build_donor(donor_id="DONOR_N", samples=[tumour_dna()]), ++ ) ++ summary = summarize_completeness(recs) ++ assert summary["total_donors"] == 4 ++ # minimal partition (sums to 4); Tier A donor never counted toward Tier B ++ assert summary["tier_a_min_clinical_complete"] == 2 # AF, AM ++ assert summary["tier_b_min_clinical_complete"] == 1 # BF ++ assert summary["incomplete_min_donors"] == 1 # N ++ # fulsome partition (sums to 4) ++ assert summary["tier_a_full_clinical_complete"] == 1 # AF ++ assert 
summary["tier_b_full_clinical_complete"] == 1 # BF ++ assert summary["incomplete_full_donors"] == 2 # AM (minimal only), N ++ ++ ++# --- fulsome vs minimal ---------------------------------------------------- # ++ ++def test_fully_complete_donor_is_fulsome(schema): ++ rec = evaluate(schema, build_donor())["DONOR"] ++ assert rec["fulsome_unmet"] == [] ++ assert rec["fulsome_complete"] is True ++ assert rec["minimal_complete"] is True ++ assert rec["level"] == "fulsome" ++ assert rec["type"] == "Tier A fulsome" ++ ++ ++def test_missing_flat_required_breaks_fulsome(schema): ++ # specimen_storage is required but is not part of the minimal set ++ rec = evaluate(schema, build_donor(with_specimen_storage=False))["DONOR"] ++ assert rec["minimal_complete"] is True ++ assert rec["fulsome_complete"] is False ++ assert rec["level"] == "minimal" ++ assert any("specimen_storage" in u for u in rec["fulsome_unmet"]) ++ ++ ++def test_missing_treatment_breaks_fulsome(schema): ++ # every donor must have >= 1 treatment object (required_instances) ++ rec = evaluate(schema, build_donor(with_treatment=False))["DONOR"] ++ assert rec["fulsome_complete"] is False ++ assert any("treatments" in u for u in rec["fulsome_unmet"]) ++ assert rec["minimal_complete"] is True # treatment existence is not a minimal criterion ++ ++ ++def test_missing_staging_is_a_conditional_gap(schema): ++ # conditional requirement raised in validate_primary_diagnoses ++ rec = evaluate(schema, build_donor(with_staging=False))["DONOR"] ++ assert rec["fulsome_complete"] is False ++ assert any("clinical_tumour_staging_system" in u or "staging" in u ++ for u in rec["fulsome_unmet"]) ++ assert rec["minimal_complete"] is True # staging not in the minimal set ++ ++ ++def test_missing_tumour_specimen_fields_is_a_conditional_gap(schema): ++ # conditional requirement raised in validate_specimens for Tumour samples ++ rec = evaluate(schema, build_donor(with_tumour_specimen_fields=False))["DONOR"] ++ assert 
rec["fulsome_complete"] is False ++ assert any("Tumour specimens require" in u for u in rec["fulsome_unmet"]) ++ assert rec["minimal_complete"] is True ++ ++ ++def test_deceased_without_death_fields_is_a_conditional_gap(schema): ++ rec = evaluate(schema, build_donor(deceased="Yes"))["DONOR"] ++ assert rec["fulsome_complete"] is False ++ assert any("cause_of_death" in u for u in rec["fulsome_unmet"]) ++ assert any("date_of_death" in u for u in rec["fulsome_unmet"]) ++ assert rec["minimal_complete"] is True # death fields not in the minimal set ++ ++ ++# --- "Not available" rule flows through fulsome ---------------------------- # ++ ++def test_not_available_keeps_donor_fulsome(schema): ++ donor = build_donor() ++ donor["gender"] = "Not available" ++ rec = evaluate(schema, donor)["DONOR"] ++ assert rec["fulsome_complete"] is True ++ ++ ++def test_blank_value_breaks_fulsome(schema): ++ donor = build_donor() ++ donor["gender"] = "" ++ rec = evaluate(schema, donor)["DONOR"] ++ assert rec["fulsome_complete"] is False ++ assert any(u.endswith(".gender") for u in rec["fulsome_unmet"]) ++ ++ ++# --- detailed failure report ----------------------------------------------- # ++ ++def test_completeness_failures_report(schema): ++ recs = evaluate( ++ schema, ++ build_donor(donor_id="DONOR_AF"), # fully complete ++ build_donor(donor_id="DONOR_AM", with_staging=False), # tier A, not fulsome ++ build_donor(donor_id="DONOR_N", samples=[tumour_dna()]), # untiered ++ ) ++ report = build_completeness_failures(recs, schema.tier_criteria) ++ ++ assert report["total_donors"] == 3 ++ assert report["failing_donors"] == 2 ++ ids = {d["donor_id"] for d in report["donors"]} ++ assert "DONOR_AF" not in ids # fully complete -> excluded ++ assert ids == {"DONOR_AM", "DONOR_N"} ++ ++ am = next(d for d in report["donors"] if d["donor_id"] == "DONOR_AM") ++ assert am["fulsome_complete"] is False ++ assert any("fulsome" in r.lower() for r in am["reasons"]) ++ assert any("staging" in u.lower() for u 
def summarize_completeness(donor_completeness):
    """Aggregate per-donor completeness records into ID-free counts.

    Two independent partitions of all donors are produced:

    * minimal: ``tier_a_min_clinical_complete`` + ``tier_b_min_clinical_complete``
      + ``incomplete_min_donors``
    * fulsome: ``tier_a_full_clinical_complete`` + ``tier_b_full_clinical_complete``
      + ``incomplete_full_donors``

    A donor lands in a tier bucket only when its ``tier`` is that tier AND the
    matching completeness flag is truthy; every other donor (tier neither "A"
    nor "B", or flag falsy) is counted in that partition's incomplete bucket.
    Each partition therefore sums to ``total_donors``.

    Args:
        donor_completeness: mapping of donor id -> record. Each record is read
            for ``tier`` and, for tiered donors, ``minimal_complete`` and
            ``fulsome_complete``.

    Returns:
        dict with ``total_donors`` and the six bucket counts named above.
    """
    summary = {
        "total_donors": len(donor_completeness),
        "tier_a_min_clinical_complete": 0,
        "tier_b_min_clinical_complete": 0,
        "incomplete_min_donors": 0,
        "tier_a_full_clinical_complete": 0,
        "tier_b_full_clinical_complete": 0,
        "incomplete_full_donors": 0,
    }
    # One row per partition: (record flag, Tier A bucket, Tier B bucket,
    # incomplete bucket). Both partitions apply exactly the same rule.
    partitions = (
        ("minimal_complete", "tier_a_min_clinical_complete",
         "tier_b_min_clinical_complete", "incomplete_min_donors"),
        ("fulsome_complete", "tier_a_full_clinical_complete",
         "tier_b_full_clinical_complete", "incomplete_full_donors"),
    )
    for rec in donor_completeness.values():
        tier = rec["tier"]
        for flag, tier_a_bucket, tier_b_bucket, incomplete_bucket in partitions:
            # The tier is tested first so the flag is only read for tiered donors.
            if tier == "A" and rec[flag]:
                summary[tier_a_bucket] += 1
            elif tier == "B" and rec[flag]:
                summary[tier_b_bucket] += 1
            else:
                summary[incomplete_bucket] += 1
    return summary


def build_completeness_failures(donor_completeness, tier_criteria=None):
    """Build a detailed report of every donor that is not fully complete.

    A donor is considered failing unless its ``tier`` is "A" or "B" AND its
    ``fulsome_complete`` flag is truthy. Each failing donor gets a list of
    human-readable reasons plus the raw sample counts and unmet-field lists
    copied (by reference) from its record.

    Args:
        donor_completeness: mapping of donor id -> record with the keys
            ``tier``, ``minimal_complete``, ``fulsome_complete``,
            ``sample_counts``, ``minimal_unmet`` and ``fulsome_unmet``.
        tier_criteria: optional mapping of tier name -> {sample kind: count},
            used only to word the "sample composition" reason. When falsy the
            reason says "any tier".

    Returns:
        dict with ``total_donors``, ``failing_donors`` and ``donors`` (one
        entry per failing donor, in input order).
    """
    # The requirement text depends only on tier_criteria, so build it once
    # rather than once per untiered donor.
    if tier_criteria:
        tier_requirement = " or ".join(
            "Tier {} ({})".format(
                tier, ", ".join(f"{n} {kind}" for kind, n in crit.items())
            )
            for tier, crit in tier_criteria.items()
        )
    else:
        tier_requirement = "any tier"

    failing = []
    for donor_id, rec in donor_completeness.items():
        tiered = rec["tier"] in ("A", "B")
        if tiered and rec["fulsome_complete"]:
            continue  # fully complete -> not a failure

        reasons = []
        if not tiered:
            reasons.append(
                f"Sample composition does not satisfy {tier_requirement}; "
                f"found {rec['sample_counts'] or 'no classifiable tumour/normal DNA/RNA samples'}"
            )
        if not rec["minimal_complete"]:
            reasons.append(
                f"Fails minimal clinical completeness: {len(rec['minimal_unmet'])} field(s) missing"
            )
        if not rec["fulsome_complete"]:
            reasons.append(
                f"Fails fulsome clinical completeness: {len(rec['fulsome_unmet'])} "
                f"required/conditionally-required field(s) missing"
            )
        failing.append({
            "donor_id": donor_id,
            "tier": rec["tier"],
            "minimal_complete": rec["minimal_complete"],
            "fulsome_complete": rec["fulsome_complete"],
            "reasons": reasons,
            "sample_counts": rec["sample_counts"],
            "minimal_unmet": rec["minimal_unmet"],
            "fulsome_unmet": rec["fulsome_unmet"],
        })

    return {
        "total_donors": len(donor_completeness),
        "failing_donors": len(failing),
        "donors": failing,
    }
def generate_donor_completeness_csv(input_path):
    """Write a per-donor tier/level completeness table as CSV.

    Reads the ``donor_completeness`` mapping (donor id -> record) from the JSON
    file at ``input_path`` and writes one row per donor with the columns
    Donor, Tier, Level, Type, Minimal Complete, Fulsome Complete and
    Unmet (fulsome). The unmet requirements are joined with ``|``.

    The output file sits next to the input: a trailing
    ``_validation_results.json`` (or, failing that, a trailing ``.json``) is
    replaced by ``_donor_completeness.csv``; any other name simply gets the
    suffix appended. The output path can therefore never equal the input path,
    so the input file is never overwritten.

    Args:
        input_path: path to a JSON file, normally a ``*_validation_results.json``.

    Raises:
        KeyError: if a donor record lacks one of the reported keys.
    """
    # Local import so this function does not depend on the module's import block.
    import csv

    results_suffix = "_validation_results.json"
    if input_path.endswith(results_suffix):
        stem = input_path[:-len(results_suffix)]
    elif input_path.endswith(".json"):
        stem = input_path[:-len(".json")]
    else:
        stem = input_path
    output_path = f"{stem}_donor_completeness.csv"

    print(f"Converting {input_path} to {output_path}")
    with open(input_path) as f:
        donors = json.load(f).get("donor_completeness", {})
    # newline="" plus an explicit "\n" terminator keeps the line endings the
    # hand-written version produced while letting csv handle quoting.
    with open(output_path, "w", newline="") as out:
        writer = csv.writer(out, lineterminator="\n")
        writer.writerow([
            "Donor", "Tier", "Level", "Type",
            "Minimal Complete", "Fulsome Complete", "Unmet (fulsome)",
        ])
        for donor_id, rec in donors.items():
            writer.writerow([
                donor_id,
                rec["tier"] or "",
                # f-strings keep the textual form of every value (e.g. "True",
                # "None"); csv.writer alone would turn None into "".
                f"{rec['level']}",
                f"{rec['type']}",
                f"{rec['minimal_complete']}",
                f"{rec['fulsome_complete']}",
                # Unmet messages are free text and may contain commas; the
                # writer quotes them so the column layout stays intact.
                "|".join(rec["fulsome_unmet"]),
            ])


def main(input_path):
    """Dispatch on the JSON file's content.

    A file with a top-level ``donor_completeness`` key gets the per-donor
    tier/level table; otherwise a file with a ``statistics`` key gets the
    aggregate field statistics table from ``generate_csv``.

    Raises:
        SystemExit: if the file has neither key.
    """
    with open(input_path) as f:
        data = json.load(f)
    if "donor_completeness" in data:
        generate_donor_completeness_csv(input_path)
    elif "statistics" in data:
        generate_csv(input_path)
    else:
        raise SystemExit(
            "Input json has neither 'statistics' (a _map.json) nor "
            "'donor_completeness' (a _validation_results.json)."
        )


if __name__ == "__main__":
    args = parse_args()
    main(args.input)
+ tier_criteria = { + "A": {"tumour_dna": 1, "tumour_rna": 1, "normal_dna": 1}, + "B": {"tumour_dna": 1, "normal_dna": 1}, + } + + # Minimal completeness: reduced field set that must hold valid values on + # every existing instance of each object type. + minimal_criteria = { + "donors": ["gender", "sex_at_birth", "date_of_birth", "date_resolution"], + "primary_diagnoses": ["date_of_diagnosis", "cancer_type_code", "primary_site", "basis_of_diagnosis"], + "specimens": ["specimen_collection_date", "specimen_anatomic_location"], + "sample_registrations": ["specimen_tissue_source", "tumour_normal_designation", "specimen_type", "sample_type"], + } + + # Nested objects every donor must have for 'fulsome' completeness. Counted + # anywhere in the donor tree (e.g. treatments live under primary_diagnoses). + required_instances = [ + {"key": "treatments", "min": 1}, + ] + + # Conditionally-required fields are NOT re-listed here. 'fulsome' completeness + # is derived directly from the validation pass: every conditional requirement + # in the validate_* methods raises warn(..., conditional_required=True), and + # those warnings are attributed per-donor and fed into the fulsome check + # (see BaseSchema._evaluate_fulsome). Soft notes / consistency warnings are + # marked conditional_required=False so they don't affect completeness. + + @staticmethod + def _sample_kind(sample): + """Classify a sample_registration as e.g. 'tumour_dna' / 'normal_dna'. + + ASSUMPTION: the molecule (DNA vs RNA) is read from `sample_type`. + If the MoH model encodes it in a different field, change ONLY this + method (e.g. 
read 'specimen_type' or an analyte field instead).""" + designation = (sample.get("tumour_normal_designation") or "").lower() + sample_type = (sample.get("sample_type") or "").lower() + if "rna" in sample_type: + molecule = "rna" + elif "dna" in sample_type: + molecule = "dna" + else: + molecule = None + if "tumour" in designation or "tumor" in designation: + tn = "tumour" + elif "normal" in designation: + tn = "normal" + else: + tn = None + return f"{tn}_{molecule}" if (tn and molecule) else None + def validate_donors(self, map_json): for prop in map_json: match prop: @@ -181,7 +239,8 @@ def validate_donors(self, map_json): if map_json["lost_to_followup_reason"] is not None: if "lost_to_followup_after_clinical_event_identifier" not in map_json: self.warn( - "lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted") + "lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted", + conditional_required=False) case "date_alive_after_lost_to_followup": if map_json["date_alive_after_lost_to_followup"] is not None: if "lost_to_followup_after_clinical_event_identifier" not in map_json: @@ -239,7 +298,7 @@ def validate_donors(self, map_json): if ('diagnosis_date' in locals() and diagnosis_date not in [None, ''] and treatment_end not in [None, ''] and 'treatment_end' in locals() and treatment_end < diagnosis_date): - self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: date_of_diagnosis should be earlier than treatment_end_date ") + self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: date_of_diagnosis should be earlier than treatment_end_date ", conditional_required=False) if 'treatment_start' in locals() and treatment_start not in [None, '']: if 'death' in locals() and death not in [None, ''] and treatment_start > death: self.fail( @@ -247,12 +306,12 @@ def 
validate_donors(self, map_json): if 'birth' in locals() and birth not in [None, ''] and treatment_start < birth and treatment_start is not None: self.fail(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: treatment_start_date cannot be before date_of_birth") if 'diagnosis_date' in locals() and diagnosis_date not in [None, ''] and treatment_start < diagnosis_date: - self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: treatment_start_date should not be before date_of_diagnosis") + self.warn(f"{diagnosis['submitter_primary_diagnosis_id']} > {treatment['submitter_treatment_id']}: treatment_start_date should not be before date_of_diagnosis", conditional_required=False) diagnosis_values_list = list(diagnoses_dates.values()) if (len(diagnosis_values_list) > 0 and "int" in str(type(diagnosis_values_list[0])) and 0 not in diagnosis_values_list): self.warn(f"Earliest primary_diagnosis.date_of_diagnosis.month_interval should be 0, current " - f"month_intervals: {diagnoses_dates}") + f"month_intervals: {diagnoses_dates}", conditional_required=False) case "date_of_death": if map_json["date_of_death"] is not None: if map_json["is_deceased"] in ["No", "Not available"]: @@ -286,7 +345,7 @@ def validate_donors(self, map_json): def validate_primary_diagnoses(self, map_json): if map_json["date_of_diagnosis"] is None: - self.warn("NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis") + self.warn("NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis", conditional_required=False) if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json: self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required") for prop in map_json: diff --git a/src/clinical_etl/schema.py b/src/clinical_etl/schema.py index 1609eb89..2ed0fc0a 100644 --- a/src/clinical_etl/schema.py +++ 
b/src/clinical_etl/schema.py @@ -49,6 +49,13 @@ class BaseSchema: # The component name in the OpenAPI specification schema_name = None + # Values that count as "empty" for per-donor completeness scoring. + # NOTE: "Not available" is intentionally NOT included: it is treated as a + # valid, complete answer for completeness purposes. (This differs from the + # required_but_missing / cases_missing_data stats in validate_schema, which + # still treat "Not available" as missing.) + EMPTY_VALUES = (None, "") + # schema for validation beyond jsonschema checks. Each schema that is described in the model gets an entry. validation_schema = { "examples": { # There should be a method `validate_examples` implemented to validate conditionals @@ -115,7 +122,15 @@ def __init__(self, url, simple=False): self.template = self.add_default_mappings(raw_template) - def warn(self, message): + def warn(self, message, conditional_required=True): + """Record a validation warning. + + `conditional_required` (default True) marks the warning as indicating a + required or conditionally-required field/object that is missing, so it + counts against per-donor 'fulsome' completeness. Set it False for soft + notes and cross-field consistency warnings that are not about a missing + requirement. 
The warning is attributed to the current donor via + stack_location[0] so the completeness engine can look it up.""" prefix = " > ".join(self.stack_location) if prefix.strip() == "": prefix = "" @@ -123,6 +138,11 @@ def warn(self, message): prefix += ": " message = prefix + message self.validation_warnings.append(f"{message}") + if conditional_required and self.stack_location: + donor = self.stack_location[0] + if not hasattr(self, "_conditional_gaps"): + self._conditional_gaps = {} + self._conditional_gaps.setdefault(donor, []).append(message) def fail(self, message): @@ -324,6 +344,8 @@ def validate_ingest_map(self, map_json): self.statistics["required_but_missing"] = {} self.statistics["schemas_used"] = [] self.statistics["cases_missing_data"] = [] + self.statistics["donor_completeness"] = {} + self._conditional_gaps = {} # donor_id -> [conditional-requirement warnings] for key in self.validation_schema.keys(): self.validation_schema[key]["extra_args"] = { @@ -333,6 +355,9 @@ def validate_ingest_map(self, map_json): for x in range(0, len(map_json[root_schema])): self.validate_jsonschema(map_json[root_schema][x], x) self.validate_schema(root_schema, map_json[root_schema][x]) + record = self.calculate_donor_completeness(map_json[root_schema][x]) + if record is not None: + self.statistics["donor_completeness"][record["donor_id"]] = record for schema in self.identifiers: most_common = self.identifiers[schema].most_common() if most_common[0][1] > 1: @@ -411,7 +436,10 @@ def validate_schema(self, schema_name, map_json): } self.statistics["required_but_missing"][schema_name][f]["total"] += 1 if f not in map_json or map_json[f] == "Not available": - self.warn(f"{f} required for {schema_name}") + # Flat required-field gaps are handled by the completeness + # engine's _required_complete (which, unlike this check, treats + # "Not available" as a valid value), so don't double-count here. 
+ self.warn(f"{f} required for {schema_name}", conditional_required=False) self.statistics["required_but_missing"][schema_name][f]["missing"] += 1 if case not in self.statistics["cases_missing_data"]: self.statistics["cases_missing_data"].append(case) @@ -432,3 +460,145 @@ def validate_schema(self, schema_name, map_json): else: self.validate_schema(ns, map_json[ns]) self.stack_location.pop() + + # ------------------------------------------------------------------ # + # Per-donor completeness # + # ------------------------------------------------------------------ # + # Two orthogonal axes per donor: + # * tier ("A"/"B"/None) -- driven by sample_registration composition + # * level ("fulsome"/"minimal"/"incomplete") -- driven by field validity + # A schema subclass opts in by defining `tier_criteria`, `minimal_criteria` + # and (optionally) `conditional_fields` plus the `_sample_kind` classifier. + # Schemas that don't define these get None (feature disabled for them). + + def _field_present(self, obj, field): + """True if `field` has a non-empty value on `obj`. + + "Not available" counts as a valid, complete value (see EMPTY_VALUES).""" + return isinstance(obj, dict) and field in obj and obj[field] not in self.EMPTY_VALUES + + def _find_objects(self, node, key): + """Return every object appearing under `key` anywhere in the donor tree.""" + found = [] + if isinstance(node, dict): + for k, v in node.items(): + if k == key: + found.extend(v if isinstance(v, list) else [v]) + found.extend(self._find_objects(v, key)) + elif isinstance(node, list): + for item in node: + found.extend(self._find_objects(item, key)) + return [o for o in found if isinstance(o, dict)] + + def _evaluate_tier(self, donor): + """Classify a donor's sample composition into a single, exclusive tier. + + Tier criteria are cumulative (Tier A's samples are a superset of Tier B's), + so a donor that qualifies for A also qualifies for B. 
The returned `tier` + resolves this in favour of the highest tier, so a Tier A donor is counted + ONLY as A and never toward the Tier B total. The `criteria_met` dict is + diagnostic (overlapping) and must not be used for tallying totals.""" + samples = self._find_objects(donor, "sample_registrations") + counts = {} + for s in samples: + kind = self._sample_kind(s) + if kind: + counts[kind] = counts.get(kind, 0) + 1 + criteria_met = { + tier: all(counts.get(k, 0) >= n for k, n in req.items()) + for tier, req in self.tier_criteria.items() + } + # highest satisfied tier wins; assumes tier_criteria ordered strongest-first + tier = next((t for t in self.tier_criteria if criteria_met.get(t)), None) + return tier, counts, criteria_met + + def _evaluate_minimal(self, donor): + """Check the reduced 'minimal' field set on every existing instance.""" + unmet = [] + for schema_name, fields in self.minimal_criteria.items(): + instances = [donor] if schema_name == self._root_schema() \ + else self._find_objects(donor, schema_name) + id_key = self.validation_schema.get(schema_name, {}).get("id") + for inst in instances: + ident = inst.get(id_key, "?") if id_key else "?" + unmet += [f"{schema_name}[{ident}].{f}" + for f in fields if not self._field_present(inst, f)] + return (len(unmet) == 0), unmet + + def _required_complete(self, schema_name, obj, unmet, prefix=""): + """Recursively check all required_fields across the donor tree.""" + spec = self.validation_schema[schema_name] + id_key = spec["id"] + ident = obj.get(id_key, "?") if id_key else "?" + here = f"{prefix}{schema_name}[{ident}]" + for f in spec["required_fields"]: + if not self._field_present(obj, f): + unmet.append(f"{here}.{f}") + for ns in spec["nested_schemas"]: + for child in (obj.get(ns) or []): + self._required_complete(ns, child, unmet, prefix=f"{here} > ") + + def _evaluate_required_instances(self, donor): + """Check that required nested objects exist (e.g. >= 1 treatment). 
+ + Driven by the optional `required_instances` list on the schema subclass, + each entry being {"key": , "min": }. Objects are counted + anywhere in the donor tree via _find_objects.""" + unmet = [] + for spec in getattr(self, "required_instances", []): + found = len(self._find_objects(donor, spec["key"])) + need = spec.get("min", 1) + if found < need: + unmet.append( + f"missing required object: {spec['key']} (found {found}, need >= {need})") + return unmet + + def _evaluate_fulsome(self, donor, donor_id): + """Fulsome = every required field present (across the whole tree) AND + every conditionally-required field/object present. + + Flat required fields are checked directly by _required_complete (which + honours "Not available" as a valid value). The conditional requirements + are taken from the validation pass itself: every `warn(...)` raised with + conditional_required=True during this donor's validation is a missing + conditional requirement. This means *all* conditional rules in the + validate_* methods are covered automatically and stay in sync as the + model evolves -- no rule needs to be re-listed here. 
+ + NOTE: relies on validate_schema having run for this donor first (it does, + in validate_ingest_map, immediately before calculate_donor_completeness).""" + unmet = [] + self._required_complete(self._root_schema(), donor, unmet) + unmet += getattr(self, "_conditional_gaps", {}).get(donor_id, []) + unmet += self._evaluate_required_instances(donor) + return (len(unmet) == 0), unmet + + def _root_schema(self): + return list(self.validation_schema.keys())[0] + + def calculate_donor_completeness(self, donor): + """Return a per-donor completeness record, or None if this schema does + not define completeness criteria.""" + if getattr(self, "tier_criteria", None) is None \ + or getattr(self, "minimal_criteria", None) is None: + return None + + id_field = self.validation_schema[self._root_schema()]["id"] + donor_id = donor.get(id_field) + tier, sample_counts, tier_criteria_met = self._evaluate_tier(donor) + minimal_ok, minimal_unmet = self._evaluate_minimal(donor) + # conditional gaps are keyed by stack_location[0] == str(donor_id) + fulsome_ok, fulsome_unmet = self._evaluate_fulsome(donor, str(donor_id)) + level = "fulsome" if fulsome_ok else "minimal" if minimal_ok else "incomplete" + return { + "donor_id": donor_id, + "tier": tier, # "A" / "B" / None (exclusive) + "level": level, # fulsome / minimal / incomplete + "type": (f"Tier {tier} {level}" if tier else f"untiered {level}"), + "tier_criteria_met": tier_criteria_met, # diagnostic only (overlapping) + "sample_counts": sample_counts, + "minimal_complete": minimal_ok, + "fulsome_complete": fulsome_ok, + "minimal_unmet": minimal_unmet, + "fulsome_unmet": fulsome_unmet, + } diff --git a/tests/raw_data/Biomarker.csv b/tests/raw_data/Biomarker.csv index 3d1fa750..2a70823e 100644 --- a/tests/raw_data/Biomarker.csv +++ b/tests/raw_data/Biomarker.csv @@ -11,3 +11,5 @@ DONOR_3,,,TR_3,1/5/2020,,7,327,103,8,Positive,65.8,Not applicable,23.6,Not avail DONOR_3,,,TR_3,1/5/2020,,7,207,112,9,Positive,73.5,Not available,72.8,Cannot 
be determined,Not applicable,Not applicable,Negative,, DONOR_3,,PD_3,,1/5/2020,,6,304,-99,9,,1.3,Negative,15.1,Not available,Not applicable,Not applicable,Positive,HPV16|HPV39, DONOR_5,,PD_5,,1/5/2020,,4,245,46,11,Cannot be determined,59.9,Not available,-99,Not applicable,Cannot be determined,Negative,Cannot be determined,, +CMPLT_COV1,,,,1/5/2018,,,5,,,,,,,,,,,, +CMPLT_COV2,,,,1/5/2018,,,5,,,,,,,,,,,, diff --git a/tests/raw_data/Comorbidity.csv b/tests/raw_data/Comorbidity.csv index 08e14a35..07c5c305 100644 --- a/tests/raw_data/Comorbidity.csv +++ b/tests/raw_data/Comorbidity.csv @@ -1,2 +1,4 @@ submitter_donor_id,prior_malignancy,laterality_of_prior_malignancy,age_at_comorbidity_diagnosis,comorbidity_type_code,comorbidity_treatment_status,comorbidity_treatment, DONOR_1,Yes,Right,44,C34.9,Not available,Ablation, +CMPLT_COV1,,,,C34.9,,, +CMPLT_COV2,,,,C34.9,,, diff --git a/tests/raw_data/Donor.csv b/tests/raw_data/Donor.csv index 7fdbde05..4a6b3afe 100644 --- a/tests/raw_data/Donor.csv +++ b/tests/raw_data/Donor.csv @@ -5,3 +5,10 @@ DONOR_3,TEST_1,PD_3,Lost contact,4/6/2022,No,,7/12/1945,,Non-binary,Other,month DONOR_4,TEST_1,,,,Yes,Not available,1/6/1984,239,Man,Male,month DONOR_5,TEST_2,PD_5,Not available,1/6/2022,Yes,,15/2/1984,,Woman,Female,month DONOR_6,TEST_2,PD_6,Withdrew from study,1/6/2022,No,,12/9/1974,,Non-binary,Other,month +CMPLT_AF,TEST_1,,,,No,,6/1/1960,,Woman,Female,month +CMPLT_BF,TEST_1,,,,No,,6/1/1961,,Man,Male,month +CMPLT_AM,TEST_1,,,,No,,6/1/1962,,Woman,Female,month +CMPLT_BM,TEST_1,,,,No,,6/1/1963,,Man,Male,month +CMPLT_INC,TEST_1,,,,No,,6/1/1964,,Non-binary,Other,month +CMPLT_COV1,TEST_1,,,,No,,1/1/1970,,Woman,Female,month +CMPLT_COV2,TEST_1,,,,No,,1/1/1970,,Woman,Female,month diff --git a/tests/raw_data/Exposure.csv b/tests/raw_data/Exposure.csv new file mode 100644 index 00000000..f9832ca7 --- /dev/null +++ b/tests/raw_data/Exposure.csv @@ -0,0 +1,3 @@ +submitter_donor_id,tobacco_smoking_status,tobacco_type,pack_years_smoked 
+CMPLT_COV1,Lifelong non-smoker (<100 cigarettes smoked in lifetime),, +CMPLT_COV2,Lifelong non-smoker (<100 cigarettes smoked in lifetime),, diff --git a/tests/raw_data/Followup.csv b/tests/raw_data/Followup.csv index 2a946bbd..c636ba44 100644 --- a/tests/raw_data/Followup.csv +++ b/tests/raw_data/Followup.csv @@ -5,3 +5,5 @@ FOLLOW_UP_3,DONOR_1,,,01/08/2022,Loco-regional progression,Distant recurrence/me FOLLOW_UP_4,DONOR_1,,,01/08/2022,Loco-regional progression,Biochemical progression,16-05-2022,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS, FOLLOW_UP_4,DONOR_6,,,01/07/2022,Loco-regional progression,Biochemical progression,16-05-2022,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS, DUPLICATE_ID,DONOR_4,,,01/07/2022,Loco-regional progression,Biochemical progression,18-05-2022,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS, +FU_CMPLT_COV1,CMPLT_COV1,,,1/6/2019,No evidence of disease,,,,,,,,,, +FU_CMPLT_COV2,CMPLT_COV2,,,1/6/2019,No evidence of disease,,,,,,,,,, diff --git a/tests/raw_data/PrimaryDiagnosis.csv b/tests/raw_data/PrimaryDiagnosis.csv index d74a45d3..ec168f8c 100644 --- a/tests/raw_data/PrimaryDiagnosis.csv +++ b/tests/raw_data/PrimaryDiagnosis.csv @@ -6,4 +6,11 @@ DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Cytology,AJCC cancer staging system,T DONOR_4,Brain,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS DONOR_5,Gum,PD_5,15/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2016,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB -DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of 
a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,, \ No newline at end of file +DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,, +CMPLT_AF,Breast,PD_AF,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, +CMPLT_BF,Breast,PD_BF,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, +CMPLT_AM,Breast,PD_AM,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, +CMPLT_BM,Breast,PD_BM,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, +CMPLT_INC,Breast,PD_INC,1/6/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, +CMPLT_COV1,Breast,PD_COV1,1/2/2018,C50.1,Histology of a primary tumour,AJCC cancer staging system,T1,N0,M0,,Left,,,, +CMPLT_COV2,Breast,PD_COV2,1/2/2018,C50.1,Histology of a primary tumour,Durie-Salmon staging system,,,,Stage I,Left,,,, diff --git a/tests/raw_data/Radiation.csv b/tests/raw_data/Radiation.csv index e253d360..14aa591b 100644 --- a/tests/raw_data/Radiation.csv +++ b/tests/raw_data/Radiation.csv @@ -1,3 +1,5 @@ submitter_donor_id, submitter_treatment_id, radiation_therapy_modality, radiation_therapy_type, radiation_therapy_fractions, radiation_therapy_dosage, anatomical_site_irradiated, radiation_boost, reference_radiation_treatment_id, DONOR_5,TR_5, Teleradiotherapy protons (procedure), Internal, 30,-99,FINGER (INCLUDING THUMBS),Yes, REFERENCE_RADIATION_TREATMENT_2, DONOR_5,TR_5, Teleradiotherapy protons (procedure), Internal, 10,33,FINGER (INCLUDING THUMBS),No,, +CMPLT_COV1,TR_COV1,Brachytherapy (procedure),External,30,50,ABDOMEN,No,, +CMPLT_COV2,TR_COV2,Brachytherapy (procedure),External,30,50,ABDOMEN,No,, diff --git a/tests/raw_data/Sample_Registration.csv b/tests/raw_data/Sample_Registration.csv index f77fa143..c9d9157d 100644 --- 
a/tests/raw_data/Sample_Registration.csv +++ b/tests/raw_data/Sample_Registration.csv @@ -3,3 +3,19 @@ SAMPLE_REGISTRATION_1,DONOR_2,SPECIMEN_4,Cervical mucus,Tumour,Recurrent tumour, SAMPLE_REGISTRATION_2,DONOR_2,SPECIMEN_7,Cervical mucus,Normal,Recurrent tumour,Total DNA,Bar SAMPLE_REGISTRATION_3,DONOR_2,SPECIMEN_5,Cervical mucus,Normal,Recurrent tumour,Total DNA,Baz SAMPLE_REGISTRATION_4,DONOR_5,SPECIMEN_6,Cervical mucus,Normal,Recurrent tumour,Total DNA,Bat +SAMP_AF_TD,CMPLT_AF,SPEC_AF_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo +SAMP_AF_TR,CMPLT_AF,SPEC_AF_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total RNA,Foo +SAMP_AF_ND,CMPLT_AF,SPEC_AF_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +SAMP_BF_TD,CMPLT_BF,SPEC_BF_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo +SAMP_BF_ND,CMPLT_BF,SPEC_BF_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +SAMP_AM_TD,CMPLT_AM,SPEC_AM_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo +SAMP_AM_TR,CMPLT_AM,SPEC_AM_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total RNA,Foo +SAMP_AM_ND,CMPLT_AM,SPEC_AM_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +SAMP_BM_TD,CMPLT_BM,SPEC_BM_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo +SAMP_BM_ND,CMPLT_BM,SPEC_BM_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +SAMP_INC_ND,CMPLT_INC,SPEC_INC_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +SAMP_COV1_TD,CMPLT_COV1,SPEC_COV1_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo +SAMP_COV1_TR,CMPLT_COV1,SPEC_COV1_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total RNA,Foo +SAMP_COV1_ND,CMPLT_COV1,SPEC_COV1_N,Blood derived - peripheral blood,Normal,Normal,Total DNA,Foo +SAMP_COV2_TD,CMPLT_COV2,SPEC_COV2_T,Blood derived - peripheral blood,Tumour,Primary tumour,Total DNA,Foo +SAMP_COV2_ND,CMPLT_COV2,SPEC_COV2_N,Blood derived - 
peripheral blood,Normal,Normal,Total DNA,Foo diff --git a/tests/raw_data/Specimen.csv b/tests/raw_data/Specimen.csv index b2087472..1b11340f 100644 --- a/tests/raw_data/Specimen.csv +++ b/tests/raw_data/Specimen.csv @@ -6,3 +6,16 @@ DONOR_2,PD_2_1,SPECIMEN_4,,Durie-Salmon staging system,23/12/2021,RNA later froz DONOR_2,PD_2,SPECIMEN_5,TR_7,Durie-Salmon staging system,07/12/2020,Frozen in -70 freezer,,C15.9,,,,,,,Formalin fixed & paraffin embedded,,,,,,,,,,,,,,, DONOR_5,PD_5,SPECIMEN_6,,Durie-Salmon staging system,20/04/2021,Cut slide,8124/9,C15.9,,Not done,IASLC grading system,G3,51-100%,Pathology estimate by percent nuclei,Formalin fixed - buffered,,,,,,,,,,,,,,, DONOR_2,PD_2_1,SPECIMEN_7,,Durie-Salmon staging system,23/02/2021,RNA later frozen,,C43.9,,,,,,,Cryopreservation - other,,,,,,,,,,,,,,, +CMPLT_AF,PD_AF,SPEC_AF_T,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, +CMPLT_AF,PD_AF,SPEC_AF_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +CMPLT_BF,PD_BF,SPEC_BF_T,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, +CMPLT_BF,PD_BF,SPEC_BF_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +CMPLT_AM,PD_AM,SPEC_AM_T,,,1/8/2018,,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, +CMPLT_AM,PD_AM,SPEC_AM_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +CMPLT_BM,PD_BM,SPEC_BM_T,,,1/8/2018,,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, +CMPLT_BM,PD_BM,SPEC_BM_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +CMPLT_INC,PD_INC,SPEC_INC_N,,,1/8/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +CMPLT_COV1,PD_COV1,SPEC_COV1_T,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, 
+CMPLT_COV1,PD_COV1,SPEC_COV1_N,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, +CMPLT_COV2,PD_COV2,SPEC_COV2_T,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,Yes,Yes,Two-tier grading system,Low grade,51-100%,Image analysis,,,,,,,,,,,,,,,, +CMPLT_COV2,PD_COV2,SPEC_COV2_N,,,1/3/2018,Frozen in liquid nitrogen,,C50.1,,,,,,,,,,,,,,,,,,,,,, diff --git a/tests/raw_data/Surgery.csv b/tests/raw_data/Surgery.csv index ee22ddd5..95b8ac8d 100644 --- a/tests/raw_data/Surgery.csv +++ b/tests/raw_data/Surgery.csv @@ -1,3 +1,5 @@ submitter_donor_id,submitter_specimen_id,submitter_treatment_id,surgery_reference_database,surgery_reference_identifier,surgery_type,surgery_site,surgery_location,tumour_length,tumour_width,greatest_dimension_tumour,tumour_focality,residual_tumour_classification,margin_types_involved,margin_types_not_involved,lymphovascular_invasion,margin_types_not_assessed,perineural_invasion DONOR_2,SPECIMEN_4,TR_7,SNOMED,178294003,Axillary lymph nodes sampling,C14,Primary,9,7,5,Unifocal,R2,Distal margin|Circumferential resection margin,,Absent,Not available,Absent DONOR_6,SPECIMEN_43,TR_9,NCIt,C15361,Fine needle aspiration biopsy,C14,Primary,9,7,5,Unifocal,R2,Distal margin|Circumferential resection margin,,Absent,Not available,Absent +CMPLT_COV1,SPEC_COV1_T,TR_COV1,SNOMED,178294003,Excision,,,,,,,,,,,, +CMPLT_COV2,SPEC_COV2_T,TR_COV2,SNOMED,178294003,Excision,,,,,,,,,,,, diff --git a/tests/raw_data/SystemicTherapy.csv b/tests/raw_data/SystemicTherapy.csv index 7f0909c5..8a1dfc44 100644 --- a/tests/raw_data/SystemicTherapy.csv +++ b/tests/raw_data/SystemicTherapy.csv @@ -4,3 +4,5 @@ DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87333,mg/m2,150,111,PubChem,5,2,1/04/2020,1/ DONOR_3,TR_3,Hormone therapy,degarelix,46475,ug/m2,179,97,PubChem,6,3,10/12/2020,19/12/2021 DONOR_4,TR_4,Immunotherapy,Pembrolizumab,4459876,IU/kg,95,160,RxNorm,4,2,1/3/2021,12/12/2021 DONOR_2,TR_8,Immunotherapy,Pexidartinib,8836851,ug/m2,197,183,PubChem,6,1,9/5/2021,6/6/2023 
+CMPLT_COV1,TR_COV1,Chemotherapy,Cisplatin,12345,,,,RxNorm,,,1/3/2018,1/6/2018 +CMPLT_COV2,TR_COV2,Chemotherapy,Cisplatin,12345,,,,RxNorm,,,1/3/2018,1/6/2018 diff --git a/tests/raw_data/Treatment.csv b/tests/raw_data/Treatment.csv index 6b5d8543..da6e6c7c 100644 --- a/tests/raw_data/Treatment.csv +++ b/tests/raw_data/Treatment.csv @@ -9,3 +9,7 @@ TR_7,DONOR_2,PD_2_1,Surgery,Yes,01/02/2021,01/02/2022,Diagnostic,,Progressive di TR_8,DONOR_2,PD_2_1,Systemic therapy,No,01/03/2021,01/03/2022,Forensic,AML Response Criteria,Immune confirmed progressive disease (iCPD),Other TR_9,DONOR_6,PD_6,Surgery,No,01/02/2021,01/02/2022,Diagnostic,Blazer score,Progressive disease, TR_10,DONOR_5,PD_5,Systemic therapy,No,01/02/2021,01/02/2022,Forensic,Response Assessment in Neuro-Oncology (RANO),, +TR_AF,CMPLT_AF,PD_AF,Bone marrow transplant,Yes,1/7/2018,1/9/2018,Curative,,, +TR_BF,CMPLT_BF,PD_BF,Bone marrow transplant,Yes,1/7/2018,1/9/2018,Curative,,, +TR_COV1,CMPLT_COV1,PD_COV1,Systemic therapy|Radiation therapy|Surgery,Yes,1/2/2018,1/12/2018,Curative,,, +TR_COV2,CMPLT_COV2,PD_COV2,Systemic therapy|Radiation therapy|Surgery,Yes,1/2/2018,1/12/2018,Curative,,, diff --git a/tests/test2mohv3.csv b/tests/test2mohv3.csv index 0a1658e3..49a314d0 100644 --- a/tests/test2mohv3.csv +++ b/tests/test2mohv3.csv @@ -132,11 +132,11 @@ DONOR.INDEX.comorbidities.INDEX.age_at_comorbidity_diagnosis_not_available, {num DONOR.INDEX.comorbidities.INDEX.comorbidity_type_code, {single_val(Comorbidity.comorbidity_type_code)} DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment_status, {single_val(Comorbidity.comorbidity_treatment_status)} DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment, {single_val(Comorbidity.comorbidity_treatment)} -DONOR.INDEX.exposures.INDEX, {indexed_on(EXPOSURES_SHEET.submitter_donor_id)} -DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(EXPOSURES_SHEET.tobacco_smoking_status)} -DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(EXPOSURES_SHEET.tobacco_type)} 
-DONOR.INDEX.exposures.INDEX.pack_years_smoked, {set_neg_99_blank_int(EXPOSURES_SHEET.pack_years_smoked)} -DONOR.INDEX.exposures.INDEX.pack_years_smoked_not_available, {numeric_not_available(EXPOSURES_SHEET.pack_years_smoked)} +DONOR.INDEX.exposures.INDEX, {indexed_on(Exposure.submitter_donor_id)} +DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(Exposure.tobacco_smoking_status)} +DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(Exposure.tobacco_type)} +DONOR.INDEX.exposures.INDEX.pack_years_smoked, {set_neg_99_blank_int(Exposure.pack_years_smoked)} +DONOR.INDEX.exposures.INDEX.pack_years_smoked_not_available, {numeric_not_available(Exposure.pack_years_smoked)} DONOR.INDEX.biomarkers.INDEX, {indexed_on(Biomarker.submitter_donor_id)} DONOR.INDEX.biomarkers.INDEX.submitter_specimen_id, {single_val(Biomarker.submitter_specimen_id)} DONOR.INDEX.biomarkers.INDEX.submitter_primary_diagnosis_id, {single_val(Biomarker.submitter_primary_diagnosis_id)} diff --git a/tests/test_data_ingest.py b/tests/test_data_ingest.py index ae24b287..48633257 100644 --- a/tests/test_data_ingest.py +++ b/tests/test_data_ingest.py @@ -31,8 +31,9 @@ def packets(): def test_csv_convert(packets): - # there are 6 donors - assert len(packets) == 6 + # 6 original sample donors + 5 completeness fixtures (CMPLT_AF/BF/AM/BM/INC) + # + 2 full-coverage fulsome donors (CMPLT_COV1/COV2) + assert len(packets) == 13 def test_external_mapping(packets): @@ -79,7 +80,11 @@ def test_donor_2(packets): def test_validation(packets, schema): - schema.validate_ingest_map({"donors": packets}) + # Scope validation to the original sample donors so the expected warning / + # error lists below are unaffected by the CMPLT_* completeness fixtures. 
+ original_ids = {"DONOR_1", "DONOR_2", "DONOR_3", "DONOR_4", "DONOR_5", "DONOR_6"} + original = [p for p in packets if p["submitter_donor_id"] in original_ids] + schema.validate_ingest_map({"donors": original}) print(schema.validation_warnings) warnings = [ "DONOR_2 > PD_2: date_of_diagnosis required for primary_diagnoses", @@ -144,3 +149,52 @@ def test_multisheet_mapping(packets): assert len(s["multisheet"]["placeholder"]["submitter_specimen_id"]["Sample_Registration"]) == 0 assert len(s["multisheet"]["placeholder"]["extra"]["Sample_Registration"]) == 0 + +# Per-donor tier/level completeness summary over the full cohort. +# The tests/raw_data fixtures include five CMPLT_* donors purpose-built to land +# in each summary bucket: +# CMPLT_AF -> Tier A, fulsome CMPLT_BF -> Tier B, fulsome +# CMPLT_AM -> Tier A, minimal CMPLT_BM -> Tier B, minimal +# CMPLT_INC -> untiered (single normal DNA sample) -> incomplete +def test_completeness_summary(packets, schema): + schema.validate_ingest_map({"donors": packets}) + summary = CSVConvert.summarize_completeness(schema.statistics["donor_completeness"]) + + assert summary["total_donors"] == 13 + # each axis partitions all donors exactly once + assert (summary["tier_a_min_clinical_complete"] + + summary["tier_b_min_clinical_complete"] + + summary["incomplete_min_donors"]) == 13 + assert (summary["tier_a_full_clinical_complete"] + + summary["tier_b_full_clinical_complete"] + + summary["incomplete_full_donors"]) == 13 + # the CMPLT_* donors populate each category (Tier A donors are not also + # counted toward Tier B); original donors only add to the incomplete buckets + assert summary["tier_a_min_clinical_complete"] == 3 # CMPLT_AF, CMPLT_AM, CMPLT_COV1 + assert summary["tier_b_min_clinical_complete"] == 3 # CMPLT_BF, CMPLT_BM, CMPLT_COV2 + assert summary["tier_a_full_clinical_complete"] == 2 # CMPLT_AF, CMPLT_COV1 + assert summary["tier_b_full_clinical_complete"] == 2 # CMPLT_BF, CMPLT_COV2 + assert 
summary["incomplete_min_donors"] >= 1 # CMPLT_INC (+ originals) + assert summary["incomplete_full_donors"] >= 3 # CMPLT_AM, CMPLT_BM, CMPLT_INC (+ originals) + + +# CMPLT_COV1 / CMPLT_COV2 populate every object type in the model, with all +# required and conditionally-required fields filled, so they should come out +# fulsome complete. This guards against the required-field lists drifting out of +# sync with the model (a newly-required field would make these donors fail). +def test_full_object_coverage_donors_are_fulsome(packets, schema): + schema.validate_ingest_map({"donors": packets}) + dc = schema.statistics["donor_completeness"] + for donor_id in ("CMPLT_COV1", "CMPLT_COV2"): + assert dc[donor_id]["fulsome_complete"] is True, dc[donor_id]["fulsome_unmet"] + assert dc[donor_id]["fulsome_unmet"] == [] + + cov = next(p for p in packets if p["submitter_donor_id"] == "CMPLT_COV1") + # donor-level objects + for key in ("primary_diagnoses", "followups", "biomarkers", "comorbidities", "exposures"): + assert cov.get(key), f"CMPLT_COV1 missing {key}" + pd = cov["primary_diagnoses"][0] + assert pd.get("specimens") and pd["specimens"][0].get("sample_registrations") + tr = pd["treatments"][0] + for key in ("systemic_therapies", "radiations", "surgeries"): + assert tr.get(key), f"CMPLT_COV1 treatment missing {key}" diff --git a/tests/test_donor_completeness.py b/tests/test_donor_completeness.py new file mode 100644 index 00000000..76873f77 --- /dev/null +++ b/tests/test_donor_completeness.py @@ -0,0 +1,271 @@ +"""Tests for per-donor tier/level completeness (BaseSchema completeness engine). + +Offline by design: MoHSchemaV3.__init__ fetches the OpenAPI schema over the +network, but the completeness engine only needs the class-level criteria and +the validation pass. We instantiate via __new__ and supply a permissive +json_schema ({} validates anything) so validate_ingest_map runs without network. 
+ +'fulsome' completeness is derived from the validation pass, so these tests run +the donor(s) through schema.validate_ingest_map and read the resulting +statistics["donor_completeness"], rather than calling the engine in isolation. +""" + +import pytest + +from clinical_etl.mohschemav3 import MoHSchemaV3 +from clinical_etl.CSVConvert import summarize_completeness, build_completeness_failures + + +@pytest.fixture +def schema(): + s = MoHSchemaV3.__new__(MoHSchemaV3) # bypass network __init__ + s.validation_warnings = [] + s.validation_errors = [] + s.statistics = {} + s.identifiers = {} + s.stack_location = [] + s.json_schema = {} # permissive: no jsonschema errors + return s + + +def evaluate(schema, *donors): + """Run donors through the full validation pass and return the per-donor + completeness records keyed by donor id.""" + schema.validate_ingest_map({"donors": list(donors)}) + return schema.statistics["donor_completeness"] + + +# --- fixture builders ------------------------------------------------------ # + +def sample(tn, stype, sid): + return { + "submitter_sample_id": sid, + "tumour_normal_designation": tn, + "specimen_tissue_source": "Blood derived", + "specimen_type": "Primary tumour", + "sample_type": stype, + } + + +def tumour_dna(sid="S_TDNA"): + return sample("Tumour", "Total DNA", sid) + + +def tumour_rna(sid="S_TRNA"): + return sample("Tumour", "Total RNA", sid) + + +def normal_dna(sid="S_NDNA"): + return sample("Normal", "Total DNA", sid) + + +def treatment(): + # treatment_type that does not require nested therapy/radiation/surgery objects + return { + "submitter_treatment_id": "TR1", + "treatment_type": ["Bone marrow transplant"], + "is_primary_treatment": "Yes", + "treatment_start_date": {"month_interval": 0}, + "treatment_end_date": {"month_interval": 1}, + "treatment_intent": "Curative", + } + + +def build_donor(donor_id="DONOR", samples=None, deceased="No", + with_specimen_storage=True, with_staging=True, + 
with_tumour_specimen_fields=True, with_treatment=True): + """Build a donor that is fully (fulsome) complete by default; flip a knob + to introduce a specific gap.""" + if samples is None: + samples = [tumour_dna(), tumour_rna(), normal_dna()] + specimen = { + "submitter_specimen_id": "SP1", + "specimen_collection_date": {"month_interval": 0}, + "specimen_anatomic_location": "C50", + "sample_registrations": samples, + } + if with_specimen_storage: + specimen["specimen_storage"] = "Frozen in liquid nitrogen" + if with_tumour_specimen_fields: + specimen.update({ + "reference_pathology_confirmed_diagnosis": "Yes", + "reference_pathology_confirmed_tumour_presence": "Yes", + "tumour_grading_system": "Two-tier grading system", + "tumour_grade": "Low grade", + "percent_tumour_cells_range": "51-100%", + "percent_tumour_cells_measurement_method": "Image analysis", + }) + primary_diagnosis = { + "submitter_primary_diagnosis_id": "PD1", + "date_of_diagnosis": {"month_interval": 0}, + "cancer_type_code": "C50.1", + "primary_site": "Breast", + "basis_of_diagnosis": "Histology of primary tumour", + "specimens": [specimen], + } + if with_treatment: + primary_diagnosis["treatments"] = [treatment()] + if with_staging: + primary_diagnosis["clinical_tumour_staging_system"] = "Revised International staging system (R-ISS)" + primary_diagnosis["clinical_stage_group"] = "Stage I" + return { + "submitter_donor_id": donor_id, + "gender": "Woman", + "sex_at_birth": "Female", + "date_of_birth": {"month_interval": 0}, + "date_resolution": "month", + "is_deceased": deceased, + "program_id": "PROGRAM_1", + "primary_diagnoses": [primary_diagnosis], + } + + +# --- _field_present: "Not available" counts as complete -------------------- # + +def test_not_available_is_a_valid_value(schema): + assert schema._field_present({"x": "Not available"}, "x") is True + assert schema._field_present({"x": "Woman"}, "x") is True + assert schema._field_present({"x": ""}, "x") is False + assert 
schema._field_present({"x": None}, "x") is False + assert schema._field_present({}, "x") is False + + +# --- tier classification (exclusive) --------------------------------------- # + +def test_tier_a(schema): + rec = evaluate(schema, build_donor())["DONOR"] + assert rec["tier"] == "A" + assert rec["sample_counts"] == {"tumour_dna": 1, "tumour_rna": 1, "normal_dna": 1} + assert rec["tier_criteria_met"] == {"A": True, "B": True} # diagnostic overlap only + + +def test_tier_b(schema): + rec = evaluate(schema, build_donor(samples=[tumour_dna(), normal_dna()]))["DONOR"] + assert rec["tier"] == "B" + assert rec["tier_criteria_met"] == {"A": False, "B": True} + + +def test_tier_none_when_composition_incomplete(schema): + rec = evaluate(schema, build_donor(samples=[tumour_dna()]))["DONOR"] + assert rec["tier"] is None + + +def test_summary_buckets(schema): + recs = evaluate( + schema, + # Tier A, fulsome + build_donor(donor_id="DONOR_AF"), + # Tier B, fulsome + build_donor(donor_id="DONOR_BF", samples=[tumour_dna(), normal_dna()]), + # Tier A, minimal only (missing conditional staging -> not fulsome) + build_donor(donor_id="DONOR_AM", with_staging=False), + # No qualifying tier (single tumour DNA sample) + build_donor(donor_id="DONOR_N", samples=[tumour_dna()]), + ) + summary = summarize_completeness(recs) + assert summary["total_donors"] == 4 + # minimal partition (sums to 4); Tier A donor never counted toward Tier B + assert summary["tier_a_min_clinical_complete"] == 2 # AF, AM + assert summary["tier_b_min_clinical_complete"] == 1 # BF + assert summary["incomplete_min_donors"] == 1 # N + # fulsome partition (sums to 4) + assert summary["tier_a_full_clinical_complete"] == 1 # AF + assert summary["tier_b_full_clinical_complete"] == 1 # BF + assert summary["incomplete_full_donors"] == 2 # AM (minimal only), N + + +# --- fulsome vs minimal ---------------------------------------------------- # + +def test_fully_complete_donor_is_fulsome(schema): + rec = evaluate(schema, 
build_donor())["DONOR"] + assert rec["fulsome_unmet"] == [] + assert rec["fulsome_complete"] is True + assert rec["minimal_complete"] is True + assert rec["level"] == "fulsome" + assert rec["type"] == "Tier A fulsome" + + +def test_missing_flat_required_breaks_fulsome(schema): + # specimen_storage is required but is not part of the minimal set + rec = evaluate(schema, build_donor(with_specimen_storage=False))["DONOR"] + assert rec["minimal_complete"] is True + assert rec["fulsome_complete"] is False + assert rec["level"] == "minimal" + assert any("specimen_storage" in u for u in rec["fulsome_unmet"]) + + +def test_missing_treatment_breaks_fulsome(schema): + # every donor must have >= 1 treatment object (required_instances) + rec = evaluate(schema, build_donor(with_treatment=False))["DONOR"] + assert rec["fulsome_complete"] is False + assert any("treatments" in u for u in rec["fulsome_unmet"]) + assert rec["minimal_complete"] is True # treatment existence is not a minimal criterion + + +def test_missing_staging_is_a_conditional_gap(schema): + # conditional requirement raised in validate_primary_diagnoses + rec = evaluate(schema, build_donor(with_staging=False))["DONOR"] + assert rec["fulsome_complete"] is False + assert any("clinical_tumour_staging_system" in u or "staging" in u + for u in rec["fulsome_unmet"]) + assert rec["minimal_complete"] is True # staging not in the minimal set + + +def test_missing_tumour_specimen_fields_is_a_conditional_gap(schema): + # conditional requirement raised in validate_specimens for Tumour samples + rec = evaluate(schema, build_donor(with_tumour_specimen_fields=False))["DONOR"] + assert rec["fulsome_complete"] is False + assert any("Tumour specimens require" in u for u in rec["fulsome_unmet"]) + assert rec["minimal_complete"] is True + + +def test_deceased_without_death_fields_is_a_conditional_gap(schema): + rec = evaluate(schema, build_donor(deceased="Yes"))["DONOR"] + assert rec["fulsome_complete"] is False + assert 
any("cause_of_death" in u for u in rec["fulsome_unmet"]) + assert any("date_of_death" in u for u in rec["fulsome_unmet"]) + assert rec["minimal_complete"] is True # death fields not in the minimal set + + +# --- "Not available" rule flows through fulsome ---------------------------- # + +def test_not_available_keeps_donor_fulsome(schema): + donor = build_donor() + donor["gender"] = "Not available" + rec = evaluate(schema, donor)["DONOR"] + assert rec["fulsome_complete"] is True + + +def test_blank_value_breaks_fulsome(schema): + donor = build_donor() + donor["gender"] = "" + rec = evaluate(schema, donor)["DONOR"] + assert rec["fulsome_complete"] is False + assert any(u.endswith(".gender") for u in rec["fulsome_unmet"]) + + +# --- detailed failure report ----------------------------------------------- # + +def test_completeness_failures_report(schema): + recs = evaluate( + schema, + build_donor(donor_id="DONOR_AF"), # fully complete + build_donor(donor_id="DONOR_AM", with_staging=False), # tier A, not fulsome + build_donor(donor_id="DONOR_N", samples=[tumour_dna()]), # untiered + ) + report = build_completeness_failures(recs, schema.tier_criteria) + + assert report["total_donors"] == 3 + assert report["failing_donors"] == 2 + ids = {d["donor_id"] for d in report["donors"]} + assert "DONOR_AF" not in ids # fully complete -> excluded + assert ids == {"DONOR_AM", "DONOR_N"} + + am = next(d for d in report["donors"] if d["donor_id"] == "DONOR_AM") + assert am["fulsome_complete"] is False + assert any("fulsome" in r.lower() for r in am["reasons"]) + assert any("staging" in u.lower() for u in am["fulsome_unmet"]) + + n = next(d for d in report["donors"] if d["donor_id"] == "DONOR_N") + assert n["tier"] is None + assert any("Sample composition" in r for r in n["reasons"])