Skip to content

Commit 456fbf0

Browse files
committed
add loading and preprocessing stuff
1 parent 7c4cd2f commit 456fbf0

File tree

3 files changed

+241
-1
lines changed

3 files changed

+241
-1
lines changed

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ classifiers = [
2323
"Programming Language :: Python :: 3.10",
2424
]
2525
dynamic = ["version"]
26-
dependencies = []
26+
dependencies = [
27+
"scipy",
28+
]
2729

2830
# extras
2931
# https://peps.python.org/pep-0621/#dependencies-optional-dependencies

src/raman_analysis/loading.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
from typing import Iterable, Union
2+
3+
import numpy as np
4+
import pandas as pd
5+
import xarray as xr
6+
7+
8+
def ds2df(ds: xr.Dataset, fov: int, cell_index_start: int = 0, filename=None) -> pd.DataFrame:
    """
    Convert a single dataset into a dataframe.

    Parameters
    ----------
    ds : xr.Dataset
        The dataset to convert. The function reads ``pts_cell``,
        ``cell_raman``, ``cell_points``, ``img`` and the ``"multiplier"``
        attribute from it.
    fov : int
        The fov index.
    cell_index_start : int
        The offset of the cell index. This is necessary
        if you want to easily combine multiple datasets into a
        single dataframe down the road
    filename : str, optional
        The filename to add to an 'fname' column.

    Returns
    -------
    df : pd.DataFrame
        One row per entry of ``ds["cell_raman"][0]``, indexed by
        ("cell", "sub-cell"), with the extra columns "x", "y", "gfp_int",
        "fov" and (when *filename* is given) "fname".
    """
    # Scale factor turning ``cell_points`` into integer image indices.
    # NOTE(review): presumably cell_points are normalized to [0, 1) and the
    # image is 2048 px on each side - confirm against the acquisition code.
    image_size = 2048

    mult = ds.attrs["multiplier"]
    # Floor division: the number of cells is a whole number of sub-cell groups.
    n_cells = int(ds.pts_cell.shape[0] // mult)
    indx = pd.MultiIndex.from_product(
        (np.arange(n_cells) + cell_index_start, np.arange(mult)),
        names=("cell", "sub-cell"),
    )
    df = pd.DataFrame(ds["cell_raman"][0], index=indx)

    # "x"/"y" keep the unscaled coordinates exactly as stored in the dataset.
    points = ds["cell_points"].to_numpy()
    df["x"] = points[:, 0]
    df["y"] = points[:, 1]

    # Read the image plane once and look every point up with fancy indexing
    # rather than issuing one read per point.
    pixels = (points * image_size).astype(int)
    gfp_plane = ds["img"][1, 0].values
    df["gfp_int"] = gfp_plane[pixels[:, 0], pixels[:, 1]]

    df["fov"] = fov
    if filename is not None:
        df["fname"] = filename
    return df
54+
55+
56+
57+
58+
def glob2df(
    files: Union[Iterable[str], str],
    conditions,
    threshold: float,
    well_number: int = 0,
    cell_index_start: int = 0,
    verbose: bool = True,
) -> "tuple[pd.DataFrame, xr.DataArray]":
    """
    Load several datasets into one dataframe and label each cell's condition.

    Parameters
    ----------
    files : iterable, str
        An iterable of filenames, or a glob string to use to find the files.
    conditions : tuple(str, str)
        The two conditions in this dataset, should be listed
        in order of (undyed condition, dyed condition)
    threshold : float
        The threshold gfp value for determining dyed vs undyed cell
    well_number : int, default 0
        What well number to put in the dataframe
    cell_index_start : int
        The offset of the cell index. This is necessary
        if you want to easily combine multiple datasets into a
        single dataframe down the road
    verbose : bool, default True
        Whether to print out the file names as they are opened.

    Returns
    -------
    df : pd.DataFrame
        The combined dataframe, indexed by ("cond", "cell", "sub-cell").
    images : xr.DataArray
        The ``img`` variable of every file, concatenated along "fov".

    Raises
    ------
    ValueError
        If *conditions* does not hold exactly two entries, or if no files
        were given / matched by the glob string.
    """
    undyed_label, dyed_label = conditions

    if isinstance(files, str):
        # Imported here because the module does not import glob at the top.
        from glob import glob

        files = glob(files)
    files = list(files)
    if not files:
        raise ValueError("glob2df: no files to load")

    dfs = []
    images = []
    for fov, f in enumerate(files):
        if verbose:
            print(f)
        ds = xr.open_dataset(f)
        fov_df = ds2df(ds, fov, cell_index_start, filename=f)
        dfs.append(fov_df)
        # Continue the cell numbering where this file stopped so that cell
        # ids stay unique across files.
        cell_index_start = fov_df.index.get_level_values("cell").max() + 1
        images.append(ds["img"])
    images = xr.concat(images, dim="fov")
    df = pd.concat(dfs)

    # A cell counts as dyed when more than half of its sub-cell points exceed
    # the threshold; the per-cell verdict is broadcast back onto every row of
    # that cell, whatever the number of sub-cells per cell is.
    dyed_fraction = (df["gfp_int"] > threshold).groupby(level="cell").transform("mean")
    df["cond"] = np.where((dyed_fraction > 0.5).to_numpy(), dyed_label, undyed_label)

    df = df.set_index("cond", append=True).reorder_levels(
        ["cond", "cell", "sub-cell"],
    )
    df["well"] = well_number
    return df, images
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
from __future__ import annotations
2+
3+
import numpy as np
4+
import pandas as pd
5+
from scipy.signal import find_peaks
6+
7+
__all__ = ["find_cosmic_rays", "remove_cosmic_rays", "group_spectra_points"]
8+
9+
10+
def find_cosmic_rays(
    spectra: np.ndarray, ignore_region: tuple[int, int] = (200, 400), **kwargs
) -> np.ndarray:
    """
    Find the indices of cosmic rays.

    Parameters
    ----------
    spectra : (N, 1340)
        The arraylike of spectra to search through. A single 1D spectrum
        is also accepted and treated as one row.
    ignore_region : (int, int)
        A region to not worry about cosmic rays in. Peaks strictly between
        the two bounds are skipped; the bounds may be given in either order.
    **kwargs :
        Passed to scipy.signal.find_peaks. ``threshold`` defaults to 75 and
        ``prominence`` to 100 when not supplied.

    Returns
    -------
    idx : (M, 2) np.ndarray
        The indices of which spectra+pixels have detected cosmic rays.
        The first column contains which spectra, the second which pixel.
        The array is integer typed and has shape (0, 2) when nothing is found.
    """
    spectra = np.atleast_2d(spectra)
    min_ignore = min(ignore_region)
    max_ignore = max(ignore_region)
    threshold = kwargs.pop("threshold", 75)
    prominence = kwargs.pop("prominence", 100)

    idx = []
    for s, spec in enumerate(spectra):
        peaks, _ = find_peaks(
            spec, threshold=threshold, prominence=prominence, **kwargs
        )
        # Keep only the peaks that are not strictly inside the ignored region.
        outside = (peaks <= min_ignore) | (peaks >= max_ignore)
        idx.extend((s, int(p)) for p in peaks[outside])

    # The explicit dtype and reshape keep the result a usable (M, 2) index
    # array even when no cosmic ray was detected.
    return np.asarray(idx, dtype=int).reshape(-1, 2)
46+
47+
48+
def remove_cosmic_rays(df: pd.DataFrame, plot: bool = False, **kwargs) -> pd.DataFrame:
    """
    Process a dataframe by removing all spectra with detected cosmic rays.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe with spectra components as the first 1340 columns
    plot : bool
        Whether to generate a plot showing which spectra were removed.
    **kwargs
        Passed to scipy.signal.find_peaks

    Returns
    -------
    pd.DataFrame
        The rows of *df* in which no cosmic ray was detected.
    """
    spectra = df.iloc[:, :1340].values

    # Normalize to an integer (M, 2) array so the column slicing below also
    # works when nothing was detected.
    cosmic_idx = np.asarray(find_cosmic_rays(spectra, **kwargs), dtype=int).reshape(-1, 2)
    # Only the first column identifies spectra; the second column holds pixel
    # positions and must not be used to decide which rows to drop.
    flagged_rows = cosmic_idx[:, 0]
    keep_idx = np.setdiff1d(np.arange(spectra.shape[0]), flagged_rows)

    if plot:
        import matplotlib.pyplot as plt

        unique_cosmic, offset_idx = np.unique(flagged_rows, return_inverse=True)
        # Shift each removed spectrum vertically so they do not overlap.
        offsets = np.arange(unique_cosmic.shape[0]) * 100
        fig, axs = plt.subplots(1, 2, figsize=(10, 6))
        axs[0].set_title("Post Removal")
        axs[0].plot(spectra[keep_idx].T, alpha=0.75)

        axs[1].plot(spectra[unique_cosmic].T + offsets)
        axs[1].plot(
            cosmic_idx[:, 1],
            spectra[flagged_rows, cosmic_idx[:, 1]] + offsets[offset_idx],
            "rx",
            markersize=10,
            label="detected cosmic rays",
            mew=5,
        )
        axs[1].legend()

        # This panel shows the spectra that were dropped, not the kept ones.
        axs[1].set_title("Removed spectra - with offset")
    return df.iloc[keep_idx]
92+
93+
94+
def group_spectra_points(df, multiplier: int) -> pd.DataFrame:
    """
    Add which point each spectra is from to the multiindex.

    Within every (T, P) group, consecutive runs of *multiplier* rows are
    given the same point label. Labels restart for each T of the same P and
    continue counting from one P to the next, so a label never refers to
    points at two different positions.

    Parameters
    ----------
    df : pd.DataFrame
        The raman dataframe. Should be organized as T, P, type
        (index level 0 is T, index level 1 is P). It is not modified.
    multiplier : int
        How many subspectra per point

    Returns
    -------
    pd.DataFrame
        a copy of the original dataframe with pt appended to the multiindex

    Raises
    ------
    ValueError
        If a (T, P) group does not hold a whole number of points.
    """
    pt = np.full(len(df), -1, dtype=int)
    # Positional row numbers carrying the original index, so groups can be
    # written back by position without touching the caller's dataframe.
    rows = pd.Series(np.arange(len(df)), index=df.index)

    offset = 0
    for _, pos_rows in rows.groupby(level=1):
        n_pts_at_pos = 0
        for _, tp_rows in pos_rows.groupby(level=0):
            n_pts, leftover = divmod(len(tp_rows), multiplier)
            if leftover:
                raise ValueError(
                    f"group of {len(tp_rows)} spectra is not a multiple of "
                    f"multiplier={multiplier}"
                )
            pt[tp_rows.to_numpy()] = np.repeat(np.arange(n_pts), multiplier) + offset
            n_pts_at_pos = max(n_pts_at_pos, n_pts)
        # Start the next position one past the largest label used here.
        offset += n_pts_at_pos

    out = df.copy()
    out["pt"] = pt
    return out.set_index("pt", append=True)

0 commit comments

Comments
 (0)