add loading and preprocessing stuff

ianhi · ianhi · commit 456fbf028896 · 2023-07-25T12:32:54.000-04:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,9 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
 ]
 dynamic = ["version"]
-dependencies = []
+dependencies = [
+    "scipy",
+]
 
 # extras
 # https://peps.python.org/pep-0621/#dependencies-optional-dependencies
diff --git a/src/raman_analysis/loading.py b/src/raman_analysis/loading.py
@@ -0,0 +1,115 @@
+from typing import Iterable, Union
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+
+def ds2df(ds: xr.Dataset, fov: int, cell_index_start: int = 0, filename=None) -> pd.DataFrame:
+    """
+    Convert a single dataset into a dataframe.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The dataset to convert
+    fov : int
+        The fov index.
+    cell_index_start : int
+        The offset of the cell index. This is necessary
+        if you want to easily combine multiple datasets into a
+        single dataframe down the road
+    filename : str, optional
+        The filename to add to an 'fname' column.
+
+    Returns
+    -------
+    df : dataframe
+    """
+    pts_cell = ds.pts_cell
+    mult = ds.attrs["multiplier"]
+    indx = pd.MultiIndex.from_product(
+        (np.arange(int(pts_cell.shape[0] / mult)) + cell_index_start, np.arange(mult)),
+        names=("cell", "sub-cell"),
+    )
+    df = pd.DataFrame(ds["cell_raman"][0], index=indx)
+    df["x"] = ds["cell_points"][:, 0]
+    df["y"] = ds["cell_points"][:, 1]
+
+    cell_points = ds["cell_points"] * 2048
+    bkd_points = ds["bkd_points"] * 2048
+    cell_raman = ds["cell_raman"] - 608
+    bkd_raman = ds["bkd_raman"] - 608
+
+    thres = 140
+
+    cell_com = cell_points.to_numpy().astype(int)
+    gfp_int = np.asarray([ds["img"][1, 0, x[0], x[1]].values for x in cell_com])
+
+    df["gfp_int"] = gfp_int
+    df["fov"] = fov
+    if filename is not None:
+        df["fname"] = filename
+    return df
+
+
+
+
+def glob2df(
+    files: Union[Iterable[str], str],
+    conditions,
+    threshold: float,
+    well_number: int = 0,
+    cell_index_start: int = 0,
+    verbose: bool = True,
+) -> pd.DataFrame:
+    """
+    Parameters
+    ----------
+    files : iterable, str
+        An iterable of filenames, or a glob string to use to find the files.
+    conditions : tuple(str, str)
+        The two conditions in this dataset, should be listed
+        in order of (undyed condition, dyed condition)
+    threshold : float
+        The threshold gfp value for determining dyed vs undyed cell
+    well_number : int, default 0
+        What well number to put in the dataframe
+    cell_index_start : int
+        The offset of the cell index. This is necessary
+        if you want to easily combine multiple datasets into a
+        single dataframe down the road
+    verbose : bool, default True
+        Whether to print out the file names as they are opened.
+
+    Returns
+    -------
+    df : pd.DataFrame
+    images : xr.DataArray
+    """
+    if isinstance(files, str):
+        files = glob(files)
+    dfs = []
+    images = []
+    for fov, f in enumerate(files):
+        if verbose:
+            print(f)
+        ds = xr.open_dataset(f)
+        dfs.append(ds2df(ds, fov, cell_index_start, filename=f))
+        cell_index_start = dfs[-1].index.max()[0] + 1
+        images.append(ds["img"])
+    images = xr.concat(images, dim="fov")
+    df = pd.concat(dfs)
+
+    # determine condition and update multiindex
+    cond_idx = [((df["gfp_int"] > threshold).groupby("cell").mean() > 0.5).astype(int)]
+    df["cond"] = np.broadcast_to(np.atleast_2d(np.array(conditions)).T, (2, 13))[
+        tuple(cond_idx)
+    ].ravel()
+    # df["cond"] = np.array([[condition] * 13, ["N"] * 13])[tuple(cond_idx)].ravel()
+    df.set_index("cond", append=True, inplace=True)
+    df = df.reorder_levels(
+        ["cond", "cell", "sub-cell"],
+    )
+    df["well"] = well_number
+    return df, images
diff --git a/src/raman_analysis/preprocessing.py b/src/raman_analysis/preprocessing.py
@@ -0,0 +1,123 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from scipy.signal import find_peaks
+
+__all__ = ["find_cosmic_rays", "remove_cosmic_rays", "group_spectra_points"]
+
+
+def find_cosmic_rays(
+    spectra: np.ndarray, ignore_region: tuple[int, int] = (200, 400), **kwargs
+) -> np.ndarray:
+    """
+    Find the indices of cosmic rays.
+
+    Parameters
+    ----------
+    spectra : (N, 1340)
+        The arraylike of spectr to search through
+    ignore_region : (int, int)
+        A region to not worry about cosmic rays in.
+    **kwargs :
+        Passed to scipy.signal.find_peaks
+
+    Returns
+    -------
+    idx : (M, 2) np.ndarray
+        The indices of which spectra+pixels have detected cosmic rays.
+        The first column contains which spectra, the second which pixel.
+    """
+    spectra = np.atleast_2d(spectra)
+    idx = []
+    min_ignore = min(ignore_region)
+    max_ignore = max(ignore_region)
+    threshold = kwargs.pop("threshold", 75)
+    prominence = kwargs.pop("prominence", 100)
+    for s, spec in enumerate(spectra):
+        peaks, _ = find_peaks(
+            spec, threshold=threshold, prominence=prominence, **kwargs
+        )
+        for p in peaks:
+            if min_ignore < p < max_ignore:
+                continue
+            idx.append((s, p))
+    return np.asarray(idx)
+
+
+def remove_cosmic_rays(df: pd.DataFrame, plot: bool = False, **kwargs) -> pd.DataFrame:
+    """
+    Process a dataframe by removing all spectra with detected cosmic rays.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The dataframe with spectra components as the first 1340 columns
+    plot : bool
+        Whether to generate a plot showing which spectra were removed.
+    **kwargs
+        Passed to scipy.signal.find_peaks
+
+    Returns
+    -------
+    pd.DataFrame
+        The spectra dataframe with spectra with cosmic rays removed.
+    """
+    spectra = df.iloc[:, :1340].values
+
+    cosmic_idx = find_cosmic_rays(spectra, **kwargs)
+    keep_idx = np.setdiff1d(np.arange(spectra.shape[0]), cosmic_idx)
+    if plot:
+        import matplotlib.pyplot as plt
+
+        unique_cosmic, offset_idx = np.unique(cosmic_idx[:, 0], return_inverse=True)
+        offsets = np.arange(unique_cosmic.shape[0]) * 100
+        fig, axs = plt.subplots(1, 2, figsize=(10, 6))
+        axs[0].set_title("Post Removal")
+        axs[0].plot(spectra[keep_idx].T, alpha=0.75)
+
+        axs[1].plot(spectra[unique_cosmic].T + offsets)
+        axs[1].plot(
+            cosmic_idx[:, 1],
+            spectra[cosmic_idx[:, 0], cosmic_idx[:, 1]] + offsets[offset_idx],
+            "rx",
+            markersize=10,
+            label="detected cosmic rays",
+            mew=5,
+        )
+        axs[1].legend()
+
+        axs[1].set_title("Post removal - with offset")
+    return df.iloc[keep_idx]
+
+
+def group_spectra_points(df, multiplier: int) -> pd.DataFrame:
+    """
+    Add which point each spectra is from to the multiindex.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The raman dataframe. Should be organized as T, P, type
+    multiplier : int
+        How many subspectra per point
+
+    Returns
+    -------
+    pd.DataFrame
+        the original dataframe with pt_label appended to multiindex
+    """
+    offset = 0
+    for pos, pos_df in df.groupby(level=1):
+        for t, tp_df in pos_df.groupby(level=0):
+            n_pts = int(len(tp_df) / multiplier)
+            pt_labels = (
+                np.broadcast_to(
+                    np.arange(n_pts, dtype=int)[:, None], (n_pts, multiplier)
+                ).ravel()
+                + offset
+            )
+            df.loc[(t, pos), "pt"] = pt_labels
+        offset = df.loc[pos, "pt"].max()
+    df["pt"] = df["pt"].astype(int)
+    return df.set_index("pt", append=True)