Source code for ngs_toolkit.chipseq

#!/usr/bin/env python


import os

import numpy as np
import pandas as pd

from ngs_toolkit import _LOGGER
from ngs_toolkit.atacseq import ATACSeqAnalysis


[docs]class ChIPSeqAnalysis(ATACSeqAnalysis):
    """
    Class to model analysis of ChIP-seq data.
    Inherits from the :class:`~ngs_toolkit.atacseq.ATACSeqAnalysis` class.

    Parameters
    ----------
    name : :obj:`str`, optional
        Name of the analysis.

        Defaults to "analysis".
    from_pep : :obj:`str`, optional
        PEP configuration file to initialize analysis from.
        The analysis will adopt as much attributes from the PEP as possible
        but keyword arguments passed at initialization will still have priority.

        Defaults to :obj:`None` (no PEP used).
    from_pickle : :obj:`str`, optional
        Pickle file of an existing serialized analysis object
        from which the analysis should be loaded.

        Defaults to :obj:`None` (will not load from pickle).
    root_dir : :obj:`str`, optional
        Base directory for the project.

        Defaults to current directory or to what is specified in PEP if :attr:`~ngs_toolkit.analysis.Analysis.from_pep`.
    data_dir : :obj:`str`, optional
        Directory containing processed data (e.g. by looper) that will
        be input to the analysis. This is in principle not required.

        Defaults to "data".
    results_dir : :obj:`str`, optional
        Directory to contain outputs produced by the analysis.

        Defaults to "results".
    prj : :class:`peppy.Project`, optional
        A :class:`peppy.Project` object that this analysis is tied to.

        Defaults to :obj:`None`.
    samples : :obj:`list`, optional
        List of :class:`peppy.Sample` objects that this analysis is tied to.

        Defaults to :obj:`None`.
    kwargs : :obj:`dict`, optional
        Additional keyword arguments will be passed to parent class :class:`~ngs_toolkit.atacseq.ATACSeqAnalysis`.
    """
    _data_type = "ChIP-seq"

    def __init__(
        self,
        name=None,
        from_pep=False,
        from_pickle=False,
        root_dir=None,
        data_dir="data",
        results_dir="results",
        prj=None,
        samples=None,
        **kwargs
    ):
        # The check for existance is to make sure other classes can inherit from this
        default_args = {
            "data_type": "ChIP-seq",
            "__data_type__": "ChIP-seq",
            "var_unit_name": "region",
            "quantity": "binding",
            "norm_units": "RPM"}
        for k, v in default_args.items():
            if not hasattr(self, k):
                setattr(self, k, v)

        super(ChIPSeqAnalysis, self).__init__(
            name=name,
            from_pep=from_pep,
            from_pickle=from_pickle,
            root_dir=root_dir,
            data_dir=data_dir,
            results_dir=results_dir,
            prj=prj,
            samples=samples,
            **kwargs
        )

[docs]    def call_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        permissive=True,
        overwrite=True,
        distributed=True,
    ):
        """
        Call peaks for ChIP-seq samples using an annotation of which samples
        belong in each comparison and which samples represent signal or background.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        output_dir : :obj:`str`
            Parent directory where peaks will be created.

            Will be created if does not exist.
        permissive: :obj:`bool`
            If incomplete/incoherent comparisons should be skipped or an error should be thrown.

            Default is :obj:`True`.
        overwrite: :obj:`bool`
            If incomplete/incoherent comparisons should be skipped or an error should be thrown.

            Default is :obj:`True`.
        distributed: :obj:`bool`
            Whether peak calling should be run in serial or in distributed mode as jobs.

            Default is :obj:`True`.

        Raises
        ----------
        ValueError
            If not `permissive` and incomplete/incoherent comparisons are detected.
        """
        import subprocess

        from ngs_toolkit.utils import (
            macs2_call_chipseq_peak,
            homer_call_chipseq_peak_job,
        )
        from tqdm import tqdm

        if comparison_table is None:
            comparison_table = self.comparison_table
        req_columns = [
            "comparison_name",
            "sample_name",
            "comparison_side",
            "sample_group",
        ]
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        if not all([col in comparison_table.columns for col in req_columns]):
            _LOGGER.error(msg)
            raise AssertionError(msg)

        # Complement default `output_dir`
        if "{results_dir}" in output_dir:
            output_dir = os.path.abspath(
                output_dir.format(results_dir=self.results_dir)
            )
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # For each comparison
        comps = comparison_table["comparison_name"].drop_duplicates().sort_values()
        for comparison in tqdm(comps, total=len(comps), desc="Comparison"):
            # If there aren't two sides to each comparison, skip it or throw error
            if (
                len(
                    set(
                        comparison_table[
                            (comparison_table["comparison_name"] == comparison)
                        ]["comparison_side"].tolist()
                    )
                )
                != 2
            ):
                error = "Comparison '{}' does not contain two sides.".format(comparison)
                if permissive:
                    _LOGGER.warning(error)
                    continue
                else:
                    _LOGGER.error(error)
                    raise ValueError(error)

            # Get the sample names of samples in each side
            pos_names = comparison_table[
                (comparison_table["comparison_name"] == comparison)
                & (comparison_table["comparison_side"] == 1)
            ]["sample_name"].tolist()
            neg_names = comparison_table[
                (comparison_table["comparison_name"] == comparison)
                & (comparison_table["comparison_side"] < 1)
            ]["sample_name"].tolist()

            # Now get the actual samples
            signal_samples = [s for s in self.samples if s.name in pos_names]
            control_samples = [s for s in self.samples if s.name in neg_names]

            if (len(signal_samples) == 0) or (len(control_samples) == 0):
                error = "Comparison side for '{}' comparison does not contain samples.".format(
                    comparison
                )
                if permissive:
                    print(error)
                    continue
                else:
                    raise ValueError(error)

            print(
                "Doing comparison '{}' with positive samples '{}' and background samples '{}'".format(
                    comparison,
                    [s.name for s in signal_samples],
                    [s.name for s in control_samples],
                )
            )
            # Call peaks
            cmds = list()
            kwargs = {
                "signal_samples": signal_samples, "control_samples": control_samples,
                "output_dir": output_dir, "name": comparison, "distributed": distributed}
            if overwrite:
                cmds += [
                    macs2_call_chipseq_peak(**kwargs), homer_call_chipseq_peak_job(**kwargs)
                ]
            else:
                if not os.path.exists(
                    os.path.join(
                        output_dir, comparison, comparison + "_peaks.narrowPeak"
                    )
                ):
                    cmds += [
                        macs2_call_chipseq_peak(**kwargs)
                    ]
                if not os.path.exists(
                    os.path.join(
                        output_dir, comparison, comparison + "_homer_peaks.narrowPeak"
                    )
                ):
                    cmds += [
                        homer_call_chipseq_peak_job(**kwargs)
                    ]
                else:
                    _LOGGER.warning(
                        "Peak files for comparison '{}' already exist. Skipping.".format(
                            comparison
                        )
                    )

            if not distributed:
                for cmd in cmds:
                    _LOGGER.info(
                        "Calling peaks for comparison '{}' with command: '{}'.\n".format(
                            comparison, cmd
                        )
                    )
                    subprocess.call(cmd.split(" "))

[docs]    def filter_peaks(
        self,
        comparison_table=None,
        filter_bed=None,
        peaks_dir="{results_dir}/chipseq_peaks",
    ):
        """
        Filter peak calls for various comparisons for entries that do not overlap another BED file.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        filter_bed : :obj:`str`
            BED file with entries to filter out from the BED files of each comparison.

            Defaults to the set of Blacklisted regions from the analysis' genome.
            In that case it will be fetched if not present.
        peaks_dir : :obj:`str`
            Parent directory where peak calls for each comparison exist.
            Will be created if does not exist.

            Defaults to "{results_dir}/chipseq_peaks".

        Raises
        ----------
        AttributeError
            If `filter_bed` is not given and failes to be retrieved.
        """
        import pybedtools
        from ngs_toolkit.utils import homer_peaks_to_bed

        def _filter(input_bed, filter_bed, output_bed):
            """
            Filter BED file for entries that overlap another BED file.

            Parameters
            ----------
            input_bed : :obj:`str`
                BED file to filter.

            filter_bed : :obj:`str`
                BED file with entries to filter from input_bed.

            output_bed : :obj:`str`
                Output BED file.
            """
            (
                pybedtools.BedTool(input_bed)
                .intersect(pybedtools.BedTool(filter_bed), v=True)
                .saveas(output_bed)
            )

        if comparison_table is None:
            comparison_table = self.comparison_table

        if filter_bed is None:
            _LOGGER.info("Blacklist file not provided. Downloading...")
            try:
                filter_bed = self.get_resources(steps=["blacklist"])[
                    "blacklist_file"
                ]
            except AttributeError:
                msg = "Blacklist file was not provided and cannot be"
                msg += " get one without analysis having `organism` and `genome` set."
                _LOGGER.error(msg)
                raise AttributeError(msg)

        peaks_dir = self._format_string_with_attributes(peaks_dir)
        if not os.path.exists(peaks_dir):
            os.makedirs(peaks_dir)

        for comparison in comparison_table["comparison_name"].unique():
            # MACS2
            prefix = os.path.join(peaks_dir, comparison, comparison + "_peaks")
            _filter(prefix + ".narrowPeak", filter_bed, prefix + ".filtered.bed")
            # HOMER
            prefix = os.path.join(peaks_dir, comparison, comparison + "_homer_peaks")
            homer_peaks_to_bed(prefix + ".factor.narrowPeak", prefix + ".factor.bed")
            _filter(prefix + ".factor.bed", filter_bed, prefix + ".factor.filtered.bed")
            homer_peaks_to_bed(prefix + ".histone.narrowPeak", prefix + ".histone.bed")
            _filter(
                prefix + ".histone.bed", filter_bed, prefix + ".histone.filtered.bed"
            )

[docs]    def summarize_peaks_from_comparisons(
        self,
        comparison_table=None,
        output_dir="{results_dir}/chipseq_peaks",
        filtered=True,
        permissive=True,
    ):
        """
        Call peaks for ChIP-seq samples using an annotation of which samples belong in each comparison and which
        samples represent signal or background.

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            Comparison table with the following required columns:
            "comparison_name", "sample_name", "comparison_side", "sample_group".

            Defaults to analysis' own `comparison_table`.
        output_dir : :obj:`str`
            Parent directory where peaks will be created. Will be created if does not exist.
        permissive: :obj:`bool`
            If incomplete/incoherent comparisons should be skipped or an error should be thrown.

        Raises
        ----------
        ValueError
            Will be raised if not `permissive` and incomplete/incoherent comparisons are detected.
        """
        from ngs_toolkit.utils import homer_peaks_to_bed

        if comparison_table is None:
            comparison_table = self.comparison_table

        req_columns = [
            "comparison_name",
            "sample_name",
            "comparison_side",
            "sample_group",
        ]
        msg = "Comparison table is missing some of the following columns: '{}'.".format(
            ",".join(req_columns)
        )
        if not all([col in comparison_table.columns for col in req_columns]):
            _LOGGER.error(msg)
            raise AssertionError(msg)

        # Complement default `output_dir`
        if "{results_dir}" in output_dir:
            output_dir = os.path.abspath(
                output_dir.format(results_dir=self.results_dir)
            )

        # For each comparison, count called peaks
        peak_counts = pd.DataFrame()
        for comparison in (
            comparison_table["comparison_name"].drop_duplicates().sort_values()
        ):
            _LOGGER.info(comparison)
            ending = ".filtered.bed" if filtered else ".narrowPeak"
            for peak_type, file in [
                (
                    "macs",
                    os.path.join(
                        output_dir, comparison, comparison + "_peaks" + ending
                    ),
                ),
                (
                    "homer_factor",
                    os.path.join(
                        output_dir,
                        comparison,
                        comparison + "_homer_peaks.factor" + ending,
                    ),
                ),
                (
                    "homer_histone",
                    os.path.join(
                        output_dir,
                        comparison,
                        comparison + "_homer_peaks.histone" + ending,
                    ),
                ),
            ]:
                error = "Peak files for comparison '{}' with '{}' parameters don't exist.".format(
                    comparison, peak_type
                )

                if "homer" in peak_type and not filtered:
                    try:
                        homer_peaks_to_bed(file, file.replace("narrowPeak", "bed"))
                    except IOError:
                        if permissive:
                            _LOGGER.warning(error)
                            peak_counts = peak_counts.append(
                                pd.Series([comparison, peak_type, np.nan]),
                                ignore_index=True,
                            )
                            continue
                        else:
                            raise
                    except pd.errors.EmptyDataError:
                        peak_counts = peak_counts.append(
                            pd.Series([comparison, peak_type, 0.0]), ignore_index=True
                        )

                    file = file.replace("narrowPeak", "bed")
                try:
                    df = pd.read_csv(file, sep="\t")
                except IOError:
                    if permissive:
                        _LOGGER.warning(error)
                        peak_counts = peak_counts.append(
                            pd.Series([comparison, peak_type, np.nan]),
                            ignore_index=True,
                        )
                        continue
                    else:
                        raise
                except pd.errors.EmptyDataError:
                    peak_counts = peak_counts.append(
                        pd.Series([comparison, peak_type, 0.0]), ignore_index=True
                    )

                peak_counts = peak_counts.append(
                    pd.Series([comparison, peak_type, df.shape[0]]), ignore_index=True
                )
        peak_counts.columns = ["comparison_name", "peak_type", "peak_counts"]

        return peak_counts  # .fillna(0)

[docs]    def get_consensus_sites(
            self,
            samples=None,
            region_type="summits",
            extension=250,
            blacklist_bed=None,
            filter_chroms=True,
            permissive=False,
            save=True,
            assign=True,
            **kwargs):
        """
        Get consensus (union) of enriched sites (peaks) across all comparisons.
        There are two modes possible, defined by the value of ``region_type``:

         * peaks: simple union of all sites;
         * summits: peak summits are extended by ``extension`` and a union is made.

        For ChIP-seq, the ``comparison_table`` keyword argument or a
        ``comparison_table`` attribute set is required. Peaks/summits will be
        aggregated for the peaks called in each sample comparison.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
            Must have a ``peaks`` attribute set.

            Defaults to all samples in the analysis (``samples`` attribute).
        region_type : :obj:`str`
            The type of region to use to create the consensus region set
            - one of "summits" or "peaks".
            If "summits", peak summits will be extended by ``extension``
            before union.
            If "peaks", sample peaks will be used with no modification prior to
            union.

            Default is "summits".
        extension : :obj:`int`
            Amount to extend peaks summits by in both directions.

            Default is 250.
        blacklist_bed : {:obj:`False`, :obj:`str`}
            Either :obj:`False` or a path to a BED file with genomic positions
            to exclude from consensus peak set.

            Default is to use a blacklist file for the analysis ``genome``.
        filter_chroms : {:obj:`list`, :obj:`str`}
            A list of chromosomes to filter out or
            a string with a pattern to match to exclude chromosomes.
            Uses Pandas string methods :class:`pandas.Series.str.match`.
            Pass for example `'.*_.*|chrM'` to filter out chromosomes with a "_"
            character and a "chrM" chromosome.

            Default is not to filter anything.
        permissive : :obj:`bool`
            Whether Samples that which ``region_type`` attribute file
            does not exist should be simply skipped or an error thrown.

        comparison_table : :obj:`pandas.DataFrame`, optional
            DataFrame with signal/background combinations used to call peaks.
            Part of kwargs.

            Defaults to analysis own ``comparison_table``.
        peak_dir : :obj:`str`, optional
            Path to peaks output directory. Part of kwargs.

            Defaults to "{analysis.results_dir}/chipseq_peaks".

        Attributes
        ----------
        sites : :class:`pybedtools.BedTool`
            Bedtool with consensus sites.
        """
        import re
        from ngs_toolkit.general import get_blacklist_annotations
        import pybedtools
        from tqdm import tqdm
        import tempfile

        if "comparison_table" not in kwargs:
            comparison_table = self.comparison_table
        else:
            comparison_table = kwargs["comparison_table"]

        if "peak_dir" not in kwargs:
            peak_dir = os.path.join(self.results_dir, "chipseq_peaks")

        if region_type not in ["summits", "peaks"]:
            msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
            _LOGGER.error(msg)
            raise ValueError(msg)

        if blacklist_bed is None:
            _LOGGER.info("Blacklist file not provided. Downloading...")
            try:
                blacklist_bed = get_blacklist_annotations(self.organism, self.genome)
            except AttributeError:
                msg = "Blacklist file was not provided and cannot"
                msg += " get one without analysis having `organism` and `genome` set."
                _LOGGER.error(msg)
                raise AttributeError(msg)

        comps = comparison_table["comparison_name"].drop_duplicates()

        f = tempfile.NamedTemporaryFile()
        with open(f.name, "a") as handle:
            for comparison in tqdm(comps, total=len(comps), desc="Comparison"):
                peak_files = [
                    os.path.join(peak_dir, comparison, comparison + "_peaks.narrowPeak"),
                    os.path.join(
                        peak_dir, comparison, comparison + "_homer_peaks.factor.bed"
                    ),
                    os.path.join(
                        peak_dir, comparison, comparison + "_homer_peaks.histone.bed"
                    ),
                ]
                for peak_file in peak_files:
                    genome = (
                        comparison_table.loc[
                            comparison_table["comparison_name"] == comparison,
                            "comparison_genome",
                        ]
                        .drop_duplicates()
                        .squeeze()
                    )

                    msg = "Could not determine genome of comparison '{}'.".format(
                        comparison
                    )
                    if not isinstance(genome, str):
                        _LOGGER.error(msg)
                        raise AssertionError(msg)

                    # Get peak file
                    try:
                        f = re.sub("_peaks.narrowPeak", "_summits.bed", peak_file)
                        file = (
                            pybedtools.BedTool(f)
                            .slop(b=extension, genome=genome).fn
                            if region_type == "summits"
                            else peak_file)
                    except (ValueError, FileNotFoundError):
                        _LOGGER.warning(
                            "Input file for comparison {} ({}) not found!"
                            .format(comparison, f))
                        if not permissive:
                            raise

                    for line in open(file, 'r'):
                        handle.write(line)

        # Merge overlaping peaks across comparisons
        sites = pybedtools.BedTool(f.name).sort().merge()

        # Filter
        # # remove blacklist regions
        if blacklist_bed is not False:
            if not isinstance(blacklist_bed, pybedtools.BedTool):
                blacklist = pybedtools.BedTool(blacklist_bed)
            sites = sites.intersect(v=True, b=blacklist)

        # # filter requested chromosomes
        if filter_chroms is not None:
            if isinstance(filter_chroms, list):
                sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas()
            elif isinstance(filter_chroms, str):
                s = sites.to_dataframe()
                sites = pybedtools.BedTool.from_dataframe(s.loc[~s['chrom'].str.match(filter_chroms)])

        # Save and assign
        if save:
            output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
            sites.saveas(output_file)
            sites = pybedtools.BedTool(output_file)
        if assign:
            self.sites = sites
        return sites

[docs]    def calculate_peak_support(
            self, samples=None, region_type="summits", permissive=False,
            comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"):
        """
        Calculate a measure of support for each region in peak set
        (i.e. ratio of samples containing a peak overlapping region in union set of peaks).

        Parameters
        ----------
        comparison_table : :obj:`pandas.DataFrame`, optional
            DataFrame with signal/background combinations used to call peaks

            Defaults to analysis' own `comparison_table`.
        peak_dir : :obj:`str`, optional
            Path to peaks output directory.
            Defaults to {analysis.results_dir}/chipseq_peaks
        samples: :obj:`list`
            Not used. Provided for compatibility with ATACSeqAnalysis class.
        region_type: :obj:`str`
            Not used. Provided for compatibility with ATACSeqAnalysis class.
        permissive: :obj:`bool`
            Not used. Provided for compatibility with ATACSeqAnalysis class.

        Attributes
        ----------
        support : :obj:`pandas.DataFrame`
            DataFrame with signal/background combinations used to call peaks
        """
        import pybedtools
        from tqdm import tqdm
        from ngs_toolkit.utils import bed_to_index

        if comparison_table is None:
            comparison_table = self.comparison_table

        peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir))

        # get index
        index = bed_to_index(self.sites.to_dataframe())

        # calculate support (number of samples overlaping each merged peak)
        comps = comparison_table["comparison_name"].drop_duplicates()
        support = pd.DataFrame(index=index)
        for comparison in tqdm(comps, total=comps.shape[0], desc="Comparison"):
            peak_files = [
                (
                    "MACS",
                    os.path.join(
                        peak_dir, comparison, comparison + "_peaks.narrowPeak"
                    ),
                ),
                (
                    "HOMER_factor",
                    os.path.join(
                        peak_dir, comparison, comparison + "_homer_peaks.factor.bed"
                    ),
                ),
                (
                    "HOMER_histone",
                    os.path.join(
                        peak_dir, comparison, comparison + "_homer_peaks.histone.bed"
                    ),
                ),
            ]
            for peak_type, peak_file in peak_files:
                try:
                    sample_support = self.sites.intersect(
                        peak_file, wa=True, c=True
                    ).to_dataframe()
                except (
                    ValueError,
                    pybedtools.MalformedBedLineError,
                    pybedtools.helpers.BEDToolsError,
                ):
                    _LOGGER.warning(
                        "Peaks for comparison {} ({}) not found!".format(
                            comparison, peak_file
                        )
                    )
                    continue
                sample_support.index = index
                support[(comparison, peak_type)] = sample_support.iloc[:, 3]

        # Make multiindex labeling comparisons and peak type
        support.columns = pd.MultiIndex.from_tuples(
            support.columns, names=["comparison", "peak_type"]
        )
        support.to_csv(
            os.path.join(
                self.results_dir, self.name + "_peaks.binary_overlap_support.csv"
            ),
            index=True,
        )

        # divide sum (of unique overlaps) by total to get support value between 0 and 1
        support["support"] = support.astype(bool).astype(int).sum(axis=1) / float(
            support.shape[1]
        )

        # save
        support.to_csv(
            os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True
        )

        self.support = support

[docs]    def get_supported_peaks(self, samples=None, **kwargs):
        """
        Get mask of sites with 0 support in the given samples.
        Requires support matrix produced by `ngs_toolkit.atacseq.ATACSeqAnalysis.calculate_peak_support`.

        Parameters
        ----------
        samples : :obj:`list`
            Not used. Provided for compatibility with ATACSeqAnalysis class.

        comparisons : :obj:`list`
            Iterable of comparison names to restrict to.
            Must match name of comparisons used in comparison_table.

        Returns
        -------
        pd.Series
            Boolean Pandas Series with sites with at least one of the given samples having a peak called.
        """
        if "comparisons" not in kwargs:
            msg = "Requires a keyword argument `comparisons`."
            _LOGGER.error(msg)
            raise ValueError(msg)
        return self.support[[c for c in kwargs["comparisons"]]].sum(1) != 0

[docs]    def normalize_by_background(
            self,
            comparison_table=None,
            reduction_func=np.mean,
            comparison_func=np.subtract,
            by_group=False,
            matrix="matrix_norm",
            samples=None):
        """
        Normalize values in matrix by background samples in a comparison-specific way as specified in `comparison_table`.

        The background samples will be pooled by the `reduction_func` and their values wil be removed
        from the signal samples using the `comparison_func`.

        Parameters
        ----------
        comparison_table : :class:`pandas.DataFrame`
            Table with comparisons from which peaks were called.

            Defaults to analysis' `comparison_table`.
        reduction_func : func
            Function to reduce the region to gene values to.

            Defaults to :obj:`numpy.mean`.
        comparison_func : func
            Function to use for normalization of signal samples against background samples.
            You can also try for example :obj:`numpy.divide`.

            Defaults to :obj:`numpy.subtract`.
        by_group : :obj:`bool`
            Whether output should be by group (:obj:`True`) or for each sample (:obj:`False`).

            Default is :obj:`False`.
        matrix : {:class:`pandas.DataFrame`, :obj:`str`, optional}
            Name of attribute or pandas DataFrame to use.

            Defaults to "matrix_norm".
        samples : :obj:`list`, optional
            Subset of samples to consider.

            Defaults to all samples in analysis.

        Returns
        -------
        :class:`pandas.DataFrame`
            Dataframe with values normalized by background samples.
        """
        if comparison_table is None:
            comparison_table = self.comparison_table
        if samples is None:
            samples = self.samples
        matrix = self.get_matrix(matrix, samples=samples)

        comparisons = set(comparison_table['comparison_name'])
        s_names = [s.name for s in samples]
        if not by_group:
            res = pd.DataFrame(index=matrix.index)
            for comparison in comparisons:
                comp = comparison_table.query("comparison_name == '{}'".format(comparison))
                signal_samples = [s for s in comp.query("comparison_side == 1")['sample_name'] if s in s_names]
                background_samples = [s for s in comp.query("comparison_side == 0")['sample_name'] if s in s_names]

                for s in signal_samples:
                    res.loc[:, s] = comparison_func(
                        matrix.loc[:, s],
                        matrix.loc[:, background_samples].apply(reduction_func, axis=1)
                        if reduction_func != np.mean
                        else matrix.loc[:, background_samples].mean(axis=1))
        if by_group:
            comparisons = set(comparison_table['comparison_name'])
            s_names = [s.name for s in samples]
            res = pd.DataFrame(index=matrix.index, columns=comparisons)
            for comparison in comparisons:
                comp = comparison_table.query("comparison_name == '{}'".format(comparison))
                signal_samples = [s for s in comp.query("comparison_side == 1")['sample_name'] if s in s_names]
                background_samples = [s for s in comp.query("comparison_side == 0")['sample_name'] if s in s_names]

                res.loc[:, comparison] = comparison_func(
                    matrix.loc[:, signal_samples].apply(reduction_func, axis=1)
                    if reduction_func != np.mean
                    else matrix.loc[:, signal_samples].mean(axis=1),
                    matrix.loc[:, background_samples].apply(reduction_func, axis=1)
                    if reduction_func != np.mean
                    else matrix.loc[:, background_samples].mean(axis=1))
        return res