Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions assets/placeholder_no_file.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PLACEHOLDER
304 changes: 304 additions & 0 deletions bin/metrics_vs_depth_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
#!/usr/bin/env python

import json
from pathlib import Path

import click
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import linregress


def load_group_samples(group_definition_file, group_name):
with open(group_definition_file, "r") as f:
groups = json.load(f)
if group_name not in groups:
raise ValueError(f"Group '{group_name}' not present in {group_definition_file}")
return [str(x) for x in groups[group_name]]


def _find_column(columns, candidates):
lower = {c.lower(): c for c in columns}
for cand in candidates:
if cand.lower() in lower:
return lower[cand.lower()]
return None


def load_depth_table(depth_gene_sample_file, samples):
depth_df = pd.read_csv(depth_gene_sample_file, sep="\t", header=0)
sample_col = _find_column(depth_df.columns, ["SAMPLE_ID", "sample"])
gene_col = _find_column(depth_df.columns, ["GENE", "gene"])
depth_col = _find_column(depth_df.columns, ["MEAN_GENE_DEPTH", "mean_gene_depth"])

if sample_col is None or gene_col is None or depth_col is None:
raise ValueError("Depth table must contain SAMPLE_ID, GENE and MEAN_GENE_DEPTH columns.")

depth_df = depth_df.rename(columns={sample_col: "SAMPLE_ID", gene_col: "GENE", depth_col: "MEAN_GENE_DEPTH"})
depth_df["SAMPLE_ID"] = depth_df["SAMPLE_ID"].astype(str)
depth_df["GENE"] = depth_df["GENE"].astype(str)
depth_df = depth_df[depth_df["SAMPLE_ID"].isin(samples)].copy().reset_index(drop=True)
depth_df["MEAN_GENE_DEPTH"] = pd.to_numeric(depth_df["MEAN_GENE_DEPTH"], errors="coerce")
return depth_df


def load_mutdensity(mutdensity_file, samples, metric_name):
mut = pd.read_csv(mutdensity_file, sep="\t", header=0)
required = {"SAMPLE_ID", "GENE", "MUTDENSITY_MB"}
if not required.issubset(set(mut.columns)):
raise ValueError(f"Mutdensity file {mutdensity_file} must contain {sorted(required)}")

mut = mut[(mut["SAMPLE_ID"].astype(str).isin(samples))
& (mut["GENE"] != "ALL_GENES")
& (mut["MUTTYPES"] == "SNV")
].copy()
mut["SAMPLE_ID"] = mut["SAMPLE_ID"].astype(str)
mut["GENE"] = mut["GENE"].astype(str)
mut["metric_value"] = pd.to_numeric(mut["MUTDENSITY_MB"], errors="coerce")
mut["metric_name"] = metric_name
if "REGIONS" in mut.columns:
mut["metric_name"] = mut["metric_name"] + "." + mut["REGIONS"].fillna("unknown").astype(str)
return mut[["SAMPLE_ID", "GENE", "metric_name", "metric_value"]]

def load_adjmutdensity(mutdensity_file, samples, metric_name):
mut = pd.read_csv(mutdensity_file, sep="\t", header=0)
required = {"SAMPLE", "GENE", "synonymous", "missense", "nonsense", "essential_splice", "truncating", "nonsynonymous_splice", "all_impacts"}
if not required.issubset(set(mut.columns)):
raise ValueError(f"Mutdensity file {mutdensity_file} must contain {sorted(required)}")

mut = mut[(mut["SAMPLE"].astype(str).isin(samples))].copy()
mut["SAMPLE_ID"] = mut["SAMPLE"].astype(str)
mut["GENE"] = mut["GENE"].astype(str)

mut_dfs = []
for impact in ["synonymous", "missense", "nonsense", "essential_splice", "truncating", "nonsynonymous_splice", "all_impacts"]:
subset_mut = mut[["SAMPLE_ID", "GENE", impact]].copy()
subset_mut["metric_value"] = subset_mut[impact]
subset_mut["metric_name"] = f"{impact}_density"
mut_dfs.append(subset_mut[["SAMPLE_ID", "GENE", "metric_name", "metric_value"]])

return mut_dfs


def load_omegas(omegas_file, samples):
omega = pd.read_csv(omegas_file, sep="\t", header=0)
sample_col = "sample"
gene_col = "gene"
dnds_col = "dnds"
if sample_col is None or gene_col is None or dnds_col is None:
raise ValueError(f"Omega file {omegas_file} must contain sample, gene and dnds/omega columns")

omegas_dfs = []
omega = omega.rename(columns={sample_col: "SAMPLE_ID", gene_col: "GENE", dnds_col: "metric_value"})
for impact in ["missense", "truncating"]:
subset_omega = omega[(omega["impact"] == impact)
& ~(omega["GENE"].astype(str).str.contains("--"))
].copy()
subset_omega["SAMPLE_ID"] = subset_omega["SAMPLE_ID"].astype(str)
subset_omega = subset_omega[subset_omega["SAMPLE_ID"].isin(samples)].copy()
subset_omega["metric_value"] = pd.to_numeric(subset_omega["metric_value"], errors="coerce")
subset_omega["metric_name"] = f"omega_gloc_{impact}"
omegas_dfs.append(subset_omega[["SAMPLE_ID", "GENE", "metric_name", "metric_value"]])

return omegas_dfs


def summarize_effect_by_gene(df):
rows = []
work = df.dropna(subset=["MEAN_GENE_DEPTH", "metric_value"]).copy()
all_groups = [("all_samples", work)] + [(s, g) for s, g in work.groupby("GENE")]
for group_name, gdf in all_groups:
n = len(gdf)
if n < 3:
rows.append(
{
"sample_scope": group_name,
"n_points": n,
"pearson_r": np.nan,
"spearman_r": np.nan,
"slope_metric_per_depth": np.nan,
"linreg_pvalue": np.nan,
}
)
continue
lr = linregress(gdf["MEAN_GENE_DEPTH"], gdf["metric_value"])
rows.append(
{
"sample_scope": group_name,
"n_points": n,
"pearson_r": gdf["MEAN_GENE_DEPTH"].corr(gdf["metric_value"], method="pearson"),
"spearman_r": gdf["MEAN_GENE_DEPTH"].corr(gdf["metric_value"], method="spearman"),
"slope_metric_per_depth": lr.slope,
"linreg_pvalue": lr.pvalue,
}
)
return pd.DataFrame(rows)


def summarize_missingness(depth_df, merged_df):
m = depth_df.merge(
merged_df[["SAMPLE_ID", "GENE", "metric_name", "metric_value"]],
on=["SAMPLE_ID", "GENE"],
how="left",
)
m["is_missing_metric"] = m["metric_value"].isna()
by_gene = (
m.groupby(by = ["GENE", "metric_name"])
.apply(
lambda g: pd.Series(
{
"n_samples_total": int(len(g)),
"n_missing_metric": int(g["is_missing_metric"].sum()),
"missing_fraction": float(g["is_missing_metric"].mean()),
"median_depth_missing": g.loc[g["is_missing_metric"], "MEAN_GENE_DEPTH"].median(),
"median_depth_nonmissing": g.loc[~g["is_missing_metric"], "MEAN_GENE_DEPTH"].median(),
"min_depth_missing": g.loc[g["is_missing_metric"], "MEAN_GENE_DEPTH"].min(),
"max_depth_missing": g.loc[g["is_missing_metric"], "MEAN_GENE_DEPTH"].max(),
}
)
)
.reset_index()
)
return by_gene.sort_values(["missing_fraction", "n_missing_metric"], ascending=[False, False]).reset_index(drop=True)


def plot_scatter_per_gene(df, group_name, metric_name, output_pdf):
genes = sorted(df["GENE"].dropna().unique().tolist())
if not genes:
return

# keep only the top 200 genes
genes = genes[:200]

sns.set_style("whitegrid")
per_page = 12
ncols = 3
nrows = 4
with PdfPages(output_pdf) as pdf:
for start in range(0, len(genes), per_page):
page_genes = genes[start : start + per_page]
fig, axes = plt.subplots(nrows, ncols, figsize=(12, 16), sharex=False, sharey=False)
axes = axes.flatten()
for i, gene in enumerate(page_genes):
ax = axes[i]
sdf = df[df["GENE"] == gene].dropna(subset=["MEAN_GENE_DEPTH", "metric_value"])
if sdf.empty:
ax.set_title(f"{gene} (no data)")
continue
sns.scatterplot(
data=sdf,
x="MEAN_GENE_DEPTH",
y="metric_value",
alpha=0.5,
s=16,
linewidth=0,
ax=ax,
)
title = f"{gene} (n={len(sdf)})"
if len(sdf) >= 3:
lr = linregress(sdf["MEAN_GENE_DEPTH"], sdf["metric_value"])
title += f" | slope={lr.slope:.2e}, p={lr.pvalue:.2e}"
line_color = "darkred" if lr.pvalue < 0.05 else "darkgrey"
sns.regplot(
data=sdf,
x="MEAN_GENE_DEPTH",
y="metric_value",
scatter=False,
line_kws={"color": line_color, "linewidth": 1.2},
ax=ax,
)

y_min, y_max = sdf["metric_value"].min(), sdf["metric_value"].max()
val_range = y_max - y_min if y_max > y_min else (abs(y_max) * 0.1 if y_max != 0 else 1.0)
ax.set_ylim(bottom=-0.05 * val_range if y_min >= 0 else y_min - 0.05 * val_range)

ax.set_title(title, fontsize=9)
ax.set_xlabel("MEAN_GENE_DEPTH")
ax.set_ylabel(metric_name)

for j in range(len(page_genes), len(axes)):
axes[j].axis("off")

fig.suptitle(f"{group_name} | {metric_name} vs depth (per gene)", fontsize=14)
fig.tight_layout(rect=[0, 0, 1, 0.97])
pdf.savefig(fig)
plt.close(fig)


@click.command()
@click.option("--mutdensity-file", required=True, type=click.Path(exists=True))
@click.option("--depth-gene-sample-file", required=True, type=click.Path(exists=True))
@click.option("--group-definition", required=True, type=click.Path(exists=True))
@click.option("--group-name", required=True, type=str)
@click.option("--output-dir", required=True, type=click.Path())
@click.option("--adjusted-mutdensity-file", required=False, type=click.Path(exists=True))
@click.option("--omegas-file", required=False, type=click.Path(exists=True))
def main(
mutdensity_file,
depth_gene_sample_file,
group_definition,
group_name,
output_dir,
adjusted_mutdensity_file,
omegas_file,
):
outdir = Path(output_dir)
outdir.mkdir(parents=True, exist_ok=True)

samples = load_group_samples(group_definition, group_name)
depth_df = load_depth_table(depth_gene_sample_file, samples)

metric_frames = [load_mutdensity(mutdensity_file, samples, "mutdensity")]
if adjusted_mutdensity_file:
try:
metric_frames.extend(load_adjmutdensity(adjusted_mutdensity_file, samples, "adjusted_mutdensity"))
except Exception as e:
print(f"Warning: skipping adjusted mutdensity file {adjusted_mutdensity_file}: {e}")
if omegas_file:
try:
metric_frames.extend(load_omegas(omegas_file, samples))
except Exception as e:
print(f"Warning: skipping omegas file {omegas_file}: {e}")

metrics_df = pd.concat(metric_frames, ignore_index=True)

status_rows = []
for metric_name in metrics_df["metric_name"].unique():
print(f"Processing metric '{metric_name}'...")

metric_specific_df = metrics_df[metrics_df["metric_name"] == metric_name].copy()
merged = depth_df.merge(metric_specific_df, on=["SAMPLE_ID", "GENE"], how="left")
merged["metric_name"] = metric_name

summary_effect = summarize_effect_by_gene(merged)
summary_effect.insert(0, "metric_name", metric_name)
summary_effect.to_csv(outdir / f"{group_name}.{metric_name}.depth_effect_summary.tsv", sep="\t", index=False)

missingness = summarize_missingness(depth_df, merged)
missingness.to_csv(outdir / f"{group_name}.{metric_name}.depth_missingness_by_gene.tsv", sep="\t", index=False)

plot_scatter_per_gene(
merged,
group_name=group_name,
metric_name=metric_name,
output_pdf=outdir / f"{group_name}.{metric_name}.depth_scatter_per_gene.pdf",
)

status_rows.append(
{
"group_name": group_name,
"metric_name": metric_name,
"n_depth_rows": int(len(depth_df)),
"n_metric_rows": int(len(metric_specific_df)),
"n_merged_nonmissing": int(merged["metric_value"].notna().sum()),
}
)

pd.DataFrame(status_rows).to_csv(outdir / f"{group_name}.metrics_vs_depth_qc.status.tsv", sep="\t", index=False)


if __name__ == "__main__":
main()
Loading