Source code for htstools.qc

"""Functions for performing quality control checks on data."""

from typing import Iterable, Optional, Union
from functools import reduce, partial
from itertools import product

from pandas import merge, DataFrame
import numpy as np
from scipy import stats

# Dispatch table for the summary statistics used by this module.
#   outer key: statistic name ('sd', 'var' or 'mean')
#   inner key: the `robust` flag (False -> classical, True -> robust variant)
# Every value is a callable that takes an object exposing the corresponding
# pandas aggregation method (in this module, a groupby selection) and returns
# that object's summary.
_SUMMARY_FUNCTIONS = dict(
    # Spread as a standard deviation; the robust variant aggregates with
    # SciPy's median absolute deviation, called with its default arguments.
    sd={
        False: lambda values: values.std(),
        True: lambda values: values.agg(stats.median_abs_deviation),
    },
    # Variance has no robust counterpart here: both flags map to `.var()`.
    var={
        False: lambda values: values.var(),
        True: lambda values: values.var(),
    },
    # Location: arithmetic mean, or the median when `robust` is set.
    mean={
        False: lambda values: values.mean(),
        True: lambda values: values.median(),
    },
)

def _var(
    df: DataFrame, 
    vartype: str = 'sd',
    robust: bool = False
):
    """Apply the spread statistic selected by `vartype` and `robust` to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Object to summarise. It only needs to expose the pandas aggregation
        method used by the selected statistic; within this module a groupby
        selection is passed.
    vartype : str, optional
        Key into `_SUMMARY_FUNCTIONS` naming the statistic. Default: 'sd'.
    robust : bool, optional
        Select the robust variant of the statistic. Default: False.

    Returns
    -------
    object
        Whatever the selected summary callable returns for `df`.

    """
    variants = _SUMMARY_FUNCTIONS[vartype]
    summarise = variants[robust]
    return summarise(df)

def _mean(
    df: DataFrame, 
    robust: bool = False
):
    """Apply the location statistic (mean, or median when `robust`) to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Object to summarise. It only needs to expose `.mean()` / `.median()`;
        within this module a groupby selection is passed.
    robust : bool, optional
        Use the median instead of the mean. Default: False.

    Returns
    -------
    object
        Whatever the selected summary callable returns for `df`.

    """
    location_variants = _SUMMARY_FUNCTIONS['mean']
    summarise = location_variants[robust]
    return summarise(df)

def _make_grouped_mean_variance(
    df: DataFrame,
    measurement_col: str,
    control_col: str,
    pos: str,
    neg: str,
    group: Optional[Union[str, Iterable[str]]] = None,
    vartype: str = 'sd',
    robust: bool = False
) -> "tuple[tuple[str, str, str, str], DataFrame]":
    """Summarise positive and negative controls within each group.

    For each of the two control labels, the rows of `df` whose `control_col`
    equals that label are grouped by `group`, and the location (mean/median)
    and spread (`vartype`) of `measurement_col` are computed. The results are
    outer-merged into one summary table keyed by the group columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    measurement_col : str
        Name of the column containing the measurements to summarise.
    control_col : str
        Name of the column containing control labels.
    pos : str
        Label of positive controls in `control_col`.
    neg : str
        Label of negative controls in `control_col`.
    group : str or iterable of str, optional
        Grouping column name(s). If None, all rows are treated as one group.
    vartype : str, optional
        Spread statistic, 'sd' or 'var'. Default: 'sd'.
    robust : bool, optional
        Passed to `_mean` and `_var` to select the robust statistics.
        Default: False.

    Returns
    -------
    tuple
        `((neg_mean, neg_var, pos_mean, pos_var), summary)` where the first
        element holds the names of the generated summary columns and
        `summary` is the summary DataFrame.

    Raises
    ------
    ValueError
        If `vartype` is neither 'sd' nor 'var'.

    Notes
    -----
    A column named `<a>_<b>_wavelength`, where `<a>` and `<b>` are the second
    and third underscore-separated tokens of `measurement_col`, is carried
    into the summary when `df` has such a column.

    """
    if vartype not in ('sd', 'var'):
        raise ValueError(f"{vartype=} not supported.")

    neg_colname, pos_colname = (
        measurement_col + s for s in ('_neg', '_pos')
    )

    # Normalise `group` to a list of column names so that None, a single
    # name, and any iterable of names are all handled the same way below.
    placeholder = '__group__'
    ungrouped = group is None
    if ungrouped:
        group_cols = [placeholder]
        # `assign` returns a new frame, so the caller's data is untouched.
        df = df.assign(**{placeholder: placeholder})
    elif isinstance(group, str):
        group_cols = [group]
    else:
        group_cols = list(group)

    # Carry the matching wavelength annotation column along if it exists.
    read_type = '_'.join(measurement_col.split('_')[1:3]) + '_wavelength'
    key_cols = list(group_cols)
    if read_type in df.columns and read_type not in key_cols:
        key_cols.append(read_type)
    z_data = df[key_cols].drop_duplicates()

    for label, prefix in ((neg, neg_colname), (pos, pos_colname)):
        mean_name = prefix + '_mean'
        var_name = prefix + '_' + vartype

        # A boolean mask (rather than a query string) works for any column
        # name and for labels that contain quote characters.
        grouped = (
            df
            .loc[df[control_col] == label]
            .groupby(group_cols)[[measurement_col]]
        )

        mean_ = (
            _mean(grouped, robust)
            .reset_index()
            .rename(columns={measurement_col: mean_name})
        )
        var_ = (
            _var(grouped, vartype, robust)
            .reset_index()
            .rename(columns={measurement_col: var_name})
        )
        # Outer merges keep groups that lack one kind of control (as NaN).
        for stat in (mean_, var_):
            z_data = merge(z_data, stat, how='outer', on=group_cols)

    neg_mean, pos_mean = (
        name + '_mean' for name in (neg_colname, pos_colname)
    )
    neg_var, pos_var = (
        name + '_' + vartype for name in (neg_colname, pos_colname)
    )

    if ungrouped:
        # The placeholder only existed to drive the groupby.
        z_data = z_data.drop(columns=[placeholder])
    return (neg_mean, neg_var, pos_mean, pos_var), z_data


def ssmd(
    data: DataFrame,
    measurement_col: str,
    control_col: str,
    pos: str,
    neg: str,
    group: Optional[Union[str, Iterable[str]]] = None,
    robust: bool = False
) -> DataFrame:
    r"""Calculate SSMD based on positive and negative controls, optionally within groups.

    Calculations are performed within groups, such as batches or plates,
    indicated by the `group` column.

    This function takes the group-wise mean and variance of positive and
    negative controls ($\mu_p$, $\mu_n$, $\sigma_p^2$, $\sigma_n^2$), and then
    within each group calculates the SSMD, $s$:

    $$s = \frac{\mu_p - \mu_n}{\sqrt{\sigma_p^2 + \sigma_n^2}}$$

    Parameters
    ----------
    data : pandas.DataFrame
        Input dataframe.
    measurement_col : str
        Name of column containing raw data.
    control_col : str
        Name of column containing control indicators.
    pos : str
        Name of positive controls.
    neg : str
        Name of negative controls.
    group : str or list, optional
        Name of column containing the grouping variable, such as plates or
        batches. If not set, then the entire data is taken as one big group.
    robust : bool, optional
        Use median instead of mean (still uses variance). Default: False.

    Returns
    -------
    pandas.DataFrame
        Summary dataframe with columns for mean, variance, and SSMD. The SSMD
        is stored in a column named `measurement_col + '_ssmd'`.

    """
    (neg_mean, neg_var, pos_mean, pos_var), z_data = _make_grouped_mean_variance(
        df=data,
        measurement_col=measurement_col,
        control_col=control_col,
        pos=pos,
        neg=neg,
        group=group,
        vartype='var',
        robust=robust,
    )

    ssmd_col = measurement_col + '_ssmd'
    # Difference of control means scaled by the pooled spread; the sign is
    # positive when the positive controls read higher than the negatives.
    z_data[ssmd_col] = (
        (z_data[pos_mean] - z_data[neg_mean])
        / np.sqrt(z_data[pos_var] + z_data[neg_var])
    )

    return z_data
def z_prime_factor(
    data: DataFrame,
    measurement_col: str,
    control_col: str,
    pos: str,
    neg: str,
    group: Optional[Union[str, Iterable[str]]] = None,
    robust: bool = False
) -> DataFrame:
    r"""Calculate Z'-factor based on positive and negative controls, optionally within groups.

    Calculations are performed within groups, such as batches or plates,
    indicated by the `group` column.

    This function takes the group-wise mean and standard deviation of positive
    and negative controls ($\mu_p$, $\mu_n$, $\sigma_p$, $\sigma_n$), and then
    within each group calculates the Z'-factor, $s$:

    $$s = 1 - 3 \frac{\sigma_p + \sigma_n}{|\mu_p - \mu_n|}$$

    Parameters
    ----------
    data : pandas.DataFrame
        Input dataframe.
    measurement_col : str
        Name of column containing raw data.
    control_col : str
        Name of column containing control indicators.
    pos : str
        Name of positive controls.
    neg : str
        Name of negative controls.
    group : str or list, optional
        Name of column containing the grouping variable, such as plates or
        batches. If not set, then the entire data is taken as one big group.
    robust : bool, optional
        Use median and MAD instead of mean and standard deviation.
        Default: False.

    Returns
    -------
    pandas.DataFrame
        Summary dataframe with columns for mean, standard deviation (or MAD),
        and Z'-factor. The Z'-factor is stored in a column named
        `measurement_col + '_zprime'`.

    """
    (neg_mean, neg_sd, pos_mean, pos_sd), z_data = _make_grouped_mean_variance(
        df=data,
        measurement_col=measurement_col,
        control_col=control_col,
        pos=pos,
        neg=neg,
        group=group,
        vartype='sd',
        robust=robust,
    )

    zprime_col = measurement_col + '_zprime'
    # Three spreads on each side of the two control distributions, relative
    # to the absolute separation of their centres.
    z_data[zprime_col] = 1. - (
        3. * (z_data[pos_sd] + z_data[neg_sd])
        / (z_data[pos_mean] - z_data[neg_mean]).abs()
    )

    return z_data