Source code for ngs_toolkit.atacseq

#!/usr/bin/env python


import os

import numpy as np
import pandas as pd

from ngs_toolkit import _LOGGER
from ngs_toolkit.analysis import Analysis
from ngs_toolkit.decorators import check_has_attributes


class ATACSeqAnalysis(Analysis):
    """
    Class to model analysis of ATAC-seq data.
    Inherits from the :class:`~ngs_toolkit.analysis.Analysis` class.

    Parameters
    ----------
    name : :obj:`str`, optional
        Name of the analysis.

        Defaults to "analysis".
    from_pep : :obj:`str`, optional
        PEP configuration file to initialize analysis from.
        The analysis will adopt as many attributes from the PEP as possible,
        but keyword arguments passed at initialization will still have priority.

        Defaults to :obj:`None` (no PEP used).
    from_pickle : :obj:`str`, optional
        Pickle file of an existing serialized analysis object
        from which the analysis should be loaded.

        Defaults to :obj:`None` (will not load from pickle).
    root_dir : :obj:`str`, optional
        Base directory for the project.

        Defaults to the current directory or to what is specified in the PEP if
        :attr:`~ngs_toolkit.analysis.Analysis.from_pep` is given.
    data_dir : :obj:`str`, optional
        Directory containing processed data (e.g. by looper) that will
        be input to the analysis. This is in principle not required.

        Defaults to "data".
    results_dir : :obj:`str`, optional
        Directory to contain outputs produced by the analysis.

        Defaults to "results".
    prj : :class:`peppy.Project`, optional
        A :class:`peppy.Project` object that this analysis is tied to.

        Defaults to :obj:`None`.
    samples : :obj:`list`, optional
        List of :class:`peppy.Sample` objects that this analysis is tied to.

        Defaults to :obj:`None`.
    kwargs : :obj:`dict`, optional
        Additional keyword arguments will be passed to parent class
        :class:`~ngs_toolkit.analysis.Analysis`.

    Examples
    --------
    >>> from ngs_toolkit.atacseq import ATACSeqAnalysis

    This is an example of the beginning of an ATAC-seq analysis:

    >>> pep = "metadata/project_config.yaml"
    >>> a = ATACSeqAnalysis(from_pep=pep)
    >>> # Get consensus peak set from all samples
    >>> a.get_consensus_sites(a.samples)
    >>> # Annotate regions
    >>> a.get_peak_gene_annotation()
    >>> a.get_peak_genomic_location()
    >>> # Get coverage values for each peak in each sample of ATAC-seq
    >>> a.measure_coverage()
    >>> # Normalize jointly (quantile normalization + GC correction)
    >>> a.normalize(method="gc_content")
    >>> # Annotate quantified peaks with previously calculated metrics and features
    >>> a.annotate_features()
    >>> # Annotate with sample metadata
    >>> a.annotate_samples()
    >>> # Save object
    >>> a.to_pickle()
    """

    _data_type = "ATAC-seq"

    def __init__(
        self,
        name=None,
        from_pep=False,
        from_pickle=False,
        root_dir=None,
        data_dir="data",
        results_dir="results",
        prj=None,
        samples=None,
        **kwargs
    ):
        # The check for existence is to make sure other classes can inherit from this
        default_args = {
            "data_type": "ATAC-seq",
            "__data_type__": "ATAC-seq",
            "var_unit_name": "region",
            "quantity": "accessibility",
            "norm_units": "RPM",
        }
        for k, v in default_args.items():
            if not hasattr(self, k):
                setattr(self, k, v)

        super(ATACSeqAnalysis, self).__init__(
            name=name,
            from_pep=from_pep,
            from_pickle=from_pickle,
            root_dir=root_dir,
            data_dir=data_dir,
            results_dir=results_dir,
            prj=prj,
            samples=samples,
            **kwargs
        )
    def load_data(
        self,
        output_map=None,
        only_these_keys=None,
        prefix="{results_dir}/{name}",
        permissive=True,
    ):
        """
        Load the output files of the major functions of the Analysis.

        Parameters
        ----------
        output_map : :obj:`dict`
            Dictionary with "attribute name": "path prefix" to load the files.
        only_these_keys : :obj:`list`, optional
            Iterable of analysis attributes to load up.
            Possible attributes:

                * "matrix_raw"
                * "matrix_norm"
                * "matrix_features"
                * "sites"
                * "support"
                * "nuc"
                * "coverage_gc_corrected"
                * "gene_annotation"
                * "region_annotation"
                * "region_annotation_b"
                * "chrom_state_annotation"
                * "chrom_state_annotation_b"
                * "stats"
                * "differential_results"

            Default is all of the above.
        prefix : :obj:`str`, optional
            String prefix of files to load.
            Variables in curly braces will be formatted
            with attributes of the analysis.

            Defaults to "{results_dir}/{name}".
        permissive : :obj:`bool`, optional
            Whether an error should be ignored if reading a file causes IOError.

            Default is :obj:`True`.

        Attributes
        ----------
        sites : :class:`pybedtools.bedtool.BedTool`
            Sets a `sites` variable.
        pandas.DataFrame
            Dataframes holding the respective data,
            available as attributes described in the `only_these_keys` parameter.

        Raises
        ------
        IOError
            If not permissive and a file is not found.
        """
        import pybedtools

        prefix = self._format_string_with_attributes(prefix)

        if output_map is None:
            kwargs = {"index_col": 0}
            output_map = {
                "sites": (prefix + ".peak_set.bed", {}),
                "matrix_raw": (prefix + ".matrix_raw.csv", kwargs),
                "matrix_norm": (prefix + ".matrix_norm.csv", kwargs),
                "matrix_features": (prefix + ".matrix_features.csv", kwargs),
                "support": (prefix + ".support.csv", kwargs),
                "nuc": (prefix + ".gccontent_length.csv", kwargs),
                "gene_annotation": (prefix + ".gene_annotation.csv", kwargs),
                "closest_tss_distances": (
                    prefix + ".closest_tss_distances.csv",
                    kwargs,
                ),
                "region_annotation": (prefix + ".region_annotation.csv", kwargs),
                "region_annotation_b": (
                    prefix + ".region_annotation_background.csv",
                    kwargs,
                ),
                "region_annotation_mapping": (
                    prefix + ".region_annotation_mapping.csv",
                    kwargs,
                ),
                "region_annotation_b_mapping": (
                    prefix + ".region_annotation_background_mapping.csv",
                    kwargs,
                ),
                "chrom_state_annotation": (
                    prefix + ".chrom_state_annotation.csv",
                    kwargs,
                ),
                "chrom_state_annotation_b": (
                    prefix + ".chrom_state_annotation_background.csv",
                    kwargs,
                ),
                "chrom_state_annotation_mapping": (
                    prefix + ".chrom_state_annotation_mapping.csv",
                    kwargs,
                ),
                "chrom_state_annotation_b_mapping": (
                    prefix + ".chrom_state_annotation_background_mapping.csv",
                    kwargs,
                ),
                "stats": (prefix + ".stats_per_feature.csv", kwargs),
                "differential_results": (
                    os.path.join(
                        self.results_dir,
                        "differential_analysis_{}".format(self.data_type),
                        "differential_analysis.deseq_result.all_comparisons.csv",
                    ),
                    kwargs,
                ),
            }
        if only_these_keys is None:
            only_these_keys = list(output_map.keys())

        # Use the parent method just with an updated output_map dictionary
        Analysis.load_data(
            self,
            output_map={k: v for k, v in output_map.items() if k != "sites"},
            only_these_keys=only_these_keys,
            prefix=prefix,
            permissive=permissive,
        )

        # Special case
        if "sites" in only_these_keys:
            _LOGGER.info("Loading 'sites' analysis attribute.")
            file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
            try:
                setattr(self, "sites", pybedtools.BedTool(file))
            except IOError as e:
                if not permissive:
                    raise e
                else:
                    _LOGGER.warning(e)
    @staticmethod
    def check_region_index(matrix):
        return (
            matrix.index.str.contains(":").all()
            and matrix.index.str.contains("-").all()
        )

    @staticmethod
    def set_region_index(matrix, force=False):
        from ngs_toolkit.utils import bed_to_index

        if (not force) and ATACSeqAnalysis.check_region_index(matrix):
            _LOGGER.warning("Matrix already has well-formatted index.")
            return matrix
        else:
            req = ["chrom", "start", "end"]
            if all([x in matrix.columns for x in req]):
                matrix.index = bed_to_index(matrix)
            else:
                raise ValueError(
                    "Could not format matrix index. Missing '{}' columns.".format(
                        ",".join(req)
                    )
                )
        return matrix
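    # Usage sketch: the two static helpers above validate or build a
    # "chrom:start-end" region index on any BED-like dataframe. Assuming a
    # hypothetical dataframe `df` with "chrom", "start" and "end" columns:
    #
    #   df = ATACSeqAnalysis.set_region_index(df)
    #   assert ATACSeqAnalysis.check_region_index(df)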
    def get_consensus_sites(
        self,
        samples=None,
        region_type="summits",
        extension=250,
        blacklist_bed=None,
        filter_chroms=None,
        permissive=False,
        save=True,
        assign=True,
        **kwargs
    ):
        """
        Get consensus (union) of enriched sites (peaks) across samples.
        There are two modes possible, defined by the value of ``region_type``:

            * peaks: simple union of all sites;
            * summits: peak summits are extended by ``extension`` and a union is made.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
            Must have a ``peaks`` attribute set.

            Defaults to all samples in the analysis (``samples`` attribute).
        region_type : :obj:`str`
            The type of region to use to create the consensus region set -
            one of "summits" or "peaks".
            If "summits", peak summits will be extended by ``extension`` before union.
            If "peaks", sample peaks will be used with no modification prior to union.

            Default is "summits".
        extension : :obj:`int`
            Amount to extend peak summits by in both directions.

            Default is 250.
        blacklist_bed : {:obj:`False`, :obj:`str`}
            Either :obj:`False` or a path to a BED file with genomic positions
            to exclude from the consensus peak set.

            Default is to use a blacklist file for the analysis ``genome``.
        filter_chroms : {:obj:`list`, :obj:`str`}
            A list of chromosomes to filter out,
            or a string with a pattern to match to exclude chromosomes.
            Uses the Pandas string method :meth:`pandas.Series.str.match`.
            Pass for example ``".*_.*|chrM"`` to filter out chromosomes
            with a "_" character and the "chrM" chromosome.

            Default is not to filter anything.
        permissive : :obj:`bool`
            Whether samples whose ``region_type`` file does not exist
            should simply be skipped or an error thrown.

            Default is :obj:`False`.
        save : :obj:`bool`
            Whether to save the consensus peak set to disk.

            Default is :obj:`True`.
        assign : :obj:`bool`
            Whether to assign the consensus peak set to a ``sites`` attribute.

            Default is :obj:`True`.
        **kwargs
            Not used. Provided for compatibility with the
            :class:`ngs_toolkit.ChIPSeqAnalysis` class.

        Raises
        ------
        :obj:`ValueError`
            If not ``permissive`` and either the ``peaks`` or ``summits`` file
            of a sample is not readable, or if ``permissive``
            but none of the samples has an existing file.
        :obj:`AttributeError`
            If the analysis does not have ``organism`` and ``genome`` attributes.

        Attributes
        ----------
        sites : :class:`pybedtools.bedtool.BedTool`
            Sets a ``sites`` variable with the consensus peak set.

        Returns
        -------
        sites : :class:`pybedtools.bedtool.BedTool`
            The consensus peak set.
        """
        from tqdm import tqdm
        import pybedtools
        import tempfile

        if region_type not in ["summits", "peaks"]:
            msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
            _LOGGER.error(msg)
            raise ValueError(msg)

        if samples is None:
            samples = self.samples

        # Check which samples to run (dependent on permissive)
        samples = self._get_samples_with_input_file(
            region_type, permissive=permissive, samples=samples
        )

        if (blacklist_bed is not False) and (blacklist_bed is None):
            _LOGGER.info("Blacklist file not provided. Downloading...")
            try:
                blacklist_bed = self.get_resources(steps=["blacklist"])[
                    "blacklist_file"
                ]
            except AttributeError:
                msg = "Blacklist file was not provided and cannot be"
                msg += " fetched without `organism` and `genome` set."
                _LOGGER.error(msg)
                raise AttributeError(msg)

        f = tempfile.NamedTemporaryFile()
        with open(f.name, "a") as handle:
            for sample in tqdm(samples, total=len(samples), desc="Sample"):
                try:
                    file = (
                        pybedtools.BedTool(sample.summits)
                        .slop(b=extension, genome=sample.genome)
                        .fn
                        if region_type == "summits"
                        else sample.peaks
                    )
                except (ValueError, FileNotFoundError):
                    if not permissive:
                        raise
                    else:
                        _LOGGER.warning(
                            "Peaks for sample {} ({}) not found!".format(
                                sample, sample.peaks
                            )
                        )
                        continue
                for line in open(file, "r"):
                    handle.write(line)

        # NCBI genome FASTA files are sorted naturally while Ensembl ones are not;
        # depending on that you might want to sort the resulting BED file
        # accordingly with the following:
        # sites = sort_bed(f.name).merge()
        sites = pybedtools.BedTool(f.name).sort().merge()

        # Filter
        # # remove blacklist regions
        if blacklist_bed is not False:
            if not isinstance(blacklist_bed, pybedtools.BedTool):
                blacklist_bed = pybedtools.BedTool(blacklist_bed)
            sites = sites.intersect(v=True, b=blacklist_bed)

        # # filter requested chromosomes
        if filter_chroms is not None:
            if isinstance(filter_chroms, list):
                sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas()
            elif isinstance(filter_chroms, str):
                s = sites.to_dataframe()
                sites = pybedtools.BedTool.from_dataframe(
                    s.loc[~s["chrom"].str.match(filter_chroms)]
                )

        # Save and assign
        if save:
            output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed")
            sites.saveas(output_file)
            self.record_output_file(output_file, "consensus_sites")
            sites = pybedtools.BedTool(output_file)
        if assign:
            self.sites = sites
        return sites
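    # Usage sketch: with an analysis `a` initialized from a PEP whose samples have
    # `summits`/`peaks` files (paths are hypothetical), a consensus set can be
    # built and unplaced contigs plus the mitochondrial genome dropped in one call:
    #
    #   a = ATACSeqAnalysis(from_pep="metadata/project_config.yaml")
    #   sites = a.get_consensus_sites(region_type="summits", extension=250,
    #                                 filter_chroms=".*_.*|chrM")
    #   len(sites)  # number of consensus regions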
    def set_consensus_sites(self, bed_file, overwrite=True):
        """
        Set consensus (union) sites across samples given a BED file.

        Parameters
        ----------
        bed_file : :obj:`str`
            BED file to use as consensus sites.
        overwrite : :obj:`bool`
            Whether a possibly existing file with a consensus peak set
            for this analysis should be overwritten on disk.

        Attributes
        ----------
        sites : :class:`~pybedtools.BedTool`
            Sets a `sites` variable with the consensus peak set.
        """
        # TODO: warn if not overwrite and file exists already
        import pybedtools

        self.sites = pybedtools.BedTool(bed_file)
        if overwrite:
            default_sites = os.path.join(self.results_dir, self.name + ".peak_set.bed")
            # pybedtools will pipe to the input file!
            if bed_file == default_sites:
                self.sites.saveas(default_sites + ".new")
                os.rename(default_sites + ".new", default_sites)
            else:
                self.sites.saveas(default_sites)
    @check_has_attributes(["sites"])
    def calculate_peak_support(
        self,
        samples=None,
        region_type="summits",
        permissive=False,
        comparison_table=None,
        peak_dir=None,
    ):
        """
        Count the number of called peaks per sample in the consensus region set.

        In addition, calculate a measure of peak support (or ubiquitousness) by
        observing the ratio of samples containing a peak overlapping each region.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
            Must have a ``peaks`` attribute set.

            Defaults to all samples in the analysis (``samples`` attribute).
        region_type : :obj:`str`
            The type of region to use to create the consensus region set.
            One of "summits" or "peaks".
            If `summits`, peak summits will be extended by ``extension`` before union.
            Otherwise sample peaks will be used with no modification.

            Default is "summits".
        permissive : :obj:`bool`
            Whether samples whose `region_type` file does not exist
            should simply be skipped or an error thrown.
        comparison_table : :obj:`pandas.DataFrame`
            Not used. Provided for compatibility with the ChIPSeqAnalysis class.
        peak_dir : :obj:`str`
            Not used. Provided for compatibility with the ChIPSeqAnalysis class.

        Raises
        ------
        IOError
            If not ``permissive`` and either the ``peaks`` or ``summits`` file
            of a sample is not readable,
            or if ``permissive`` but none of the samples has an existing file.

        Attributes
        ----------
        support : :obj:`pandas.DataFrame`
            A dataframe with counts of peaks overlapping each feature
            of the consensus set.
        """
        # TODO: Implement distributed
        from tqdm import tqdm
        import pybedtools
        import tempfile

        from ngs_toolkit.utils import bed_to_index

        if samples is None:
            samples = self.samples

        # Check which samples to run (dependent on permissive)
        samples = self._get_samples_with_input_file(
            region_type, permissive=permissive, samples=samples
        )

        # calculate support (number of samples overlapping each merged peak)
        for i, sample in tqdm(enumerate(samples), total=len(samples), desc="Sample"):
            if region_type == "summits":
                peaks = sample.summits
            else:
                peaks = sample.peaks

            if i == 0:
                support = self.sites.intersect(peaks, wa=True, c=True)
            else:
                support = support.intersect(peaks, wa=True, c=True)

        try:
            support = support.to_dataframe()
        except (
            ValueError,
            pybedtools.MalformedBedLineError,
            pybedtools.helpers.BEDToolsError,
            OverflowError,
        ):
            _LOGGER.debug(
                "Could not convert support intersection directly to dataframe, "
                "saving/reading temporary file."
            )
            t = tempfile.NamedTemporaryFile()
            support.saveas(t.name)
            support = pd.read_csv(t.name, sep="\t", header=None)

        support.columns = ["chrom", "start", "end"] + [
            sample.name for sample in samples
        ]
        support.index = bed_to_index(support)
        support.to_csv(
            os.path.join(self.results_dir, self.name + ".binary_overlap_support.csv"),
            index=True,
        )

        # divide sum (of unique overlaps) by total to get support value between 0 and 1
        support.loc[:, "support"] = support[[sample.name for sample in samples]].apply(
            lambda x: sum([i if i <= 1 else 1 for i in x]) / float(len(samples)),
            axis=1,
        )

        # save
        support.to_csv(
            os.path.join(self.results_dir, self.name + ".support.csv"), index=True
        )
        setattr(self, "support", support)
        return self.support
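    # Usage sketch: after `get_consensus_sites`, per-region support can be computed
    # and inspected; the "support" column is the fraction of samples (0 to 1) with
    # a called peak overlapping each consensus region:
    #
    #   support = a.calculate_peak_support(region_type="summits")
    #   support["support"].describe()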
    def get_supported_peaks(self, samples=None, **kwargs):
        """
        Get a mask of sites supported by at least one of the given samples.

        Requires the `support` matrix produced by
        `ngs_toolkit.atacseq.ATACSeqAnalysis.calculate_peak_support`.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
        **kwargs
            Not used. Provided for compatibility with the ChIPSeqAnalysis class.

        Returns
        -------
        pd.Series
            Boolean Pandas Series marking sites where at least one
            of the given samples has a peak called.
        """
        if samples is None:
            samples = self.samples
        return self.support.loc[:, [s.name for s in samples]].sum(1) != 0
    def measure_coverage(
        self,
        samples=None,
        sites=None,
        save=True,
        assign=True,
        peak_set_name="peak_set",
        output_file="{results_dir}/{name}.matrix_raw.csv",
        permissive=False,
        distributed=False,
        overwrite=True,
        **kwargs
    ):
        """
        Measure read coverage (counts) of each sample in each region
        of the consensus sites.

        Uses parallel computing through the :class:`parmap` library.
        However, for many samples (hundreds), parallelization in a computing
        cluster is possible with the `distributed` option.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
            Must have an ``aligned_filtered_bam`` attribute set.

            Defaults to all samples in the analysis (``samples`` attribute).
        sites : {:class:`pybedtools.bedtool.BedTool`, :class:`pandas.DataFrame`, :obj:`str`}
            Sites in the genome to quantify, usually a
            :class:`pybedtools.bedtool.BedTool` from
            :class:`ngs_toolkit.atacseq.ATACSeqAnalysis.get_consensus_sites`.
            If a DataFrame, will try to convert to BED format assuming
            the first three columns are chr,start,end.
            If a string, assumes a path to a BED file.

            Defaults to the ``sites`` attribute of the analysis object.
        save : :obj:`bool`
            Whether to save to disk the coverage matrix with filename ``output_file``.

            Default is :obj:`True`.
        assign : :obj:`bool`
            Whether to assign the matrix to an attribute named ``matrix_raw``.

            Default is :obj:`True`.
        peak_set_name : :obj:`str`
            Suffix to files containing coverage if ``distributed`` is True.

            Defaults to "peak_set".
        output_file : :obj:`str`
            A path to a CSV file with coverage output.

            Default is "{results_dir}/{name}.matrix_raw.csv".
        permissive : :obj:`bool`
            Whether samples for which the ``aligned_filtered_bam`` file does not
            exist should simply be skipped or an error thrown.

            Default is :obj:`False`.
        distributed : :obj:`bool`
            Whether it should be run as jobs for each sample separately in parallel.
            Currently only implemented for a SLURM cluster.

            Default is :obj:`False`.
        overwrite : :obj:`bool`
            Whether to overwrite existing files if ``distributed`` is True.

            Default is :obj:`True`.
        **kwargs : :obj:`dict`
            Additional keyword arguments will be passed to
            `ngs_toolkit.utils.submit_job` if `distributed` is True,
            and on to a divvy submission template.
            Pass for example: computing_configuration="slurm", jobname="job",
            cores=2, mem=8000, partition="longq".

        Raises
        ------
        IOError
            If not ``permissive`` and the 'aligned_filtered_bam' file attribute
            of a sample is not readable,
            or if ``permissive`` but none of the samples has an existing file.

        Attributes
        ----------
        matrix_raw : :class:`pandas.DataFrame`
            The dataframe of raw coverage values (counts)
            of shape (n_features, m_samples).

        Returns
        -------
        :class:`pandas.DataFrame`
            Pandas DataFrame with read counts of shape (n_sites, m_samples).
        """
        import sys
        import multiprocessing

        import parmap

        from ngs_toolkit.utils import (
            count_reads_in_intervals,
            submit_job,
            to_bed_index,
        )

        if samples is None:
            samples = self.samples

        output_file = self._format_string_with_attributes(output_file)

        # Check which samples to run (dependent on permissive)
        samples = self._get_samples_with_input_file(
            "aligned_filtered_bam", samples=samples, permissive=permissive
        )

        if sites is None:
            sites = self.sites

        if not distributed:
            # Count reads with pysam
            # make strings with intervals
            sites_str = to_bed_index(sites)
            # count, create dataframe
            matrix_raw = pd.DataFrame(
                map(
                    pd.Series,
                    parmap.map(
                        count_reads_in_intervals,
                        [sample.aligned_filtered_bam for sample in samples],
                        sites_str,
                        pm_parallel=True,
                    ),
                ),
                index=[sample.name for sample in samples],
            ).T

            if assign:
                self.matrix_raw = matrix_raw
            if save:
                matrix_raw.to_csv(output_file, index=True)
            return matrix_raw

        else:
            for s in samples:
                output_dir = os.path.join(s.paths.sample_root, "coverage")
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                job_name = "{}.{}_coverage".format(peak_set_name, s.name)
                prefix = os.path.join(
                    output_dir, s.name + ".{}_coverage".format(peak_set_name)
                )
                # per-sample output; kept distinct from the matrix `output_file`
                sample_output_file = prefix + ".bed"
                log_file = prefix + ".log"
                job_file = prefix + ".sh"

                if not overwrite:
                    if os.path.exists(sample_output_file):
                        continue

                cmd = "\\\n".join(
                    [
                        "{executable} -m ngs_toolkit.recipes.coverage ",
                        "--no-overwrite" if not overwrite else "",
                        "{input_bed} {input_bam} {output_bed}",
                    ]
                ).format(
                    executable=sys.executable,
                    input_bed=sites.fn,
                    input_bam=s.aligned_filtered_bam,
                    output_bed=sample_output_file,
                )
                for k, v in [("cores", 1), ("mem", 8000), ("time", "04:00:00")]:
                    if k not in kwargs:
                        kwargs[k] = v
                submit_job(cmd, job_file, jobname=job_name, logfile=log_file, **kwargs)
            if kwargs.get("computing_configuration", None) in ["localhost", "default"]:
                _LOGGER.info("Collecting job results.")
                return self.collect_coverage(
                    samples=samples,
                    save=save,
                    assign=assign,
                    output_file=output_file,
                    permissive=permissive,
                )
    def collect_coverage(
        self,
        samples=None,
        save=True,
        assign=True,
        output_file=None,
        permissive=False,
        peak_set_name="peak_set",
        fast_and_unsafe=False,
    ):
        """
        Collect read coverage (counts) of each sample in each region
        of the consensus sites from existing files.

        Useful after running `analysis.measure_coverage()` in distributed mode.

        Parameters
        ----------
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict to.
            If not provided (:obj:`None` is passed) it will default to all
            samples in the analysis (``samples`` attribute).
        save : :obj:`bool`
            Whether to save to disk the coverage matrix with filename
            ``output_file``.
        assign : :obj:`bool`
            Whether to assign the matrix to an attribute of self named ``coverage``.
        output_file : :obj:`str`
            A path to a CSV file with coverage output.

            Default is "{results_dir}/{name}.raw_coverage.csv".
        permissive : :obj:`bool`
            Whether samples without an existing coverage file
            should simply be skipped or an error thrown.
        peak_set_name : :obj:`str`
            Suffix to files containing coverage.

            Defaults to "peak_set".
        fast_and_unsafe : :obj:`bool`
            Whether to use a faster but less safe method to concatenate the data.
            If the order of all rows in all samples is the same,
            the result should be the same.
            The default method assures that all rows are matched
            and is therefore slower.

            Defaults to :obj:`False`.

        Raises
        ------
        IOError
            If not ``permissive`` and the coverage file of a sample
            is not readable or is empty,
            or if ``permissive`` but none of the samples has an existing
            or non-empty file.

        Attributes
        ----------
        matrix_raw : :class:`pandas.DataFrame`
            The dataframe of raw coverage values (counts)
            of shape (n_features, m_samples).

        Returns
        -------
        :class:`pandas.DataFrame`
            Pandas DataFrame with read counts of shape (n_sites, m_samples).
        """
        from tqdm import tqdm

        from ngs_toolkit.utils import bed_to_index

        if samples is None:
            samples = self.samples

        for sample in samples:
            setattr(
                sample,
                "_coverage",
                os.path.join(
                    sample.paths.sample_root,
                    "coverage",
                    sample.name + ".{}_coverage.bed".format(peak_set_name),
                ),
            )

        # Check which samples to run (dependent on permissive)
        samples = self._get_samples_with_input_file(
            "_coverage", permissive=permissive, samples=samples
        )

        # Read in counts
        matrix_raw = list()
        for sample in tqdm(samples, total=len(samples)):
            cov = pd.read_csv(
                sample._coverage,
                sep="\t",
                header=None,
                names=["chrom", "start", "end", sample.name],
            )
            if cov.empty:
                msg = "Coverage file for sample '{}' is empty!".format(sample.name)
                if not permissive:
                    _LOGGER.error(msg)
                    raise IOError(msg)
                else:
                    _LOGGER.warning(msg)
                    continue
            matrix_raw.append(cov)

        if len(matrix_raw) == 0:
            msg = "No sample had a valid coverage file!"
            if permissive:
                _LOGGER.warning(msg)
                return
            else:
                _LOGGER.error(msg)
                raise IOError(msg)

        if fast_and_unsafe:
            _LOGGER.warning("Using a concatenation method that is not 100% safe.")
            matrix_raw = (
                pd.concat(matrix_raw, axis=1, sort=False).dropna().sort_index()
            )
            matrix_raw = (
                matrix_raw.loc[:, ~matrix_raw.columns.duplicated()]
                .set_index(["chrom", "start", "end"])
                .astype(int)
            )
        else:
            matrix_raw = (
                pd.concat(matrix_raw, axis=0, sort=False)
                .melt(id_vars=["chrom", "start", "end"])
                .pivot_table(
                    index=["chrom", "start", "end"],
                    columns="variable",
                    values="value",
                    fill_value=0,
                )
                .astype(int)
            )
        matrix_raw.index = bed_to_index(matrix_raw.index.to_frame())

        if assign:
            self.matrix_raw = matrix_raw
        if save:
            if output_file is not None:
                matrix_raw.to_csv(output_file, index=True)
            else:
                self.matrix_raw.to_csv(
                    os.path.join(self.results_dir, self.name + ".matrix_raw.csv"),
                    index=True,
                )
        return matrix_raw
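    # Usage sketch: after distributed jobs finish, gather the per-sample BED files
    # into the raw count matrix:
    #
    #   matrix = a.collect_coverage(fast_and_unsafe=False)
    #   matrix.shape  # (n_regions, n_samples)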
    @check_has_attributes(["organism", "genome"])
    def get_peak_gccontent_length(self, bed_file=None, fasta_file=None):
        """
        Get length and GC content of features in the region set.

        Parameters
        ----------
        bed_file : :obj:`str`
            A BED file with regions to calculate GC content on.
            Must be a 3-column BED!
            If not provided the calculation will be for the analysis
            `sites` attribute.
        fasta_file : :obj:`str`
            Fasta file of the analysis `genome`. Preferably indexed.
            If not given, will try to download.

        Returns
        -------
        pandas.DataFrame
            DataFrame with nucleotide content and length of each region.

        Attributes
        ----------
        nuc : :class:`pandas.DataFrame`
            DataFrame with length and GC-content of each feature.
        """
        import pybedtools

        from ngs_toolkit.utils import bed_to_index

        if bed_file is None:
            sites = self.sites
        else:
            sites = pybedtools.BedTool(bed_file)

        if fasta_file is None:
            _LOGGER.info(
                "Reference genome FASTA file was not given, will try to get it."
            )
            _LOGGER.info(
                "Getting genome FASTA file for organism '{}', genome '{}'. ".format(
                    self.organism, self.genome
                )
            )
            fasta_file = self.get_resources(steps=["genome"])["genome_file"]["fasta"]

        nuc = sites.nucleotide_content(fi=fasta_file).to_dataframe(comment="#")[
            ["score", "blockStarts"]
        ]
        nuc.columns = ["gc_content", "length"]
        nuc.index = bed_to_index(sites)

        # get only the sites matching the coverage (not overlapping blacklist)
        self.nuc = nuc.loc[self.matrix_raw.index]

        self.nuc.to_csv(
            os.path.join(self.results_dir, self.name + ".gccontent_length.csv"),
            index=True,
        )
        return self.nuc
    def normalize_cqn(self, matrix="matrix_raw", samples=None, save=True, assign=True):
        """
        Conditional quantile normalization (CQN) of a matrix.
        It uses GC content and length of regulatory elements as covariates.

        Requires the R package "cqn" to be installed:

        .. highlight:: R
        .. code-block:: R

            if (!requireNamespace("BiocManager", quietly = TRUE))
                install.packages("BiocManager")
            BiocManager::install("cqn")

        Parameters
        ----------
        matrix : :obj:`str`
            Attribute name of matrix to normalize.

            Defaults to "matrix_raw".
        samples : :obj:`list`
            Iterable of :class:`peppy.Sample` objects to restrict matrix to.

            Defaults to all samples in analysis.
        save : :obj:`bool`
            Whether to write the normalized DataFrame to disk.

            Default is :obj:`True`.
        assign : :obj:`bool`
            Whether to assign the normalized DataFrame to an attribute
            `matrix_norm`.

            Default is :obj:`True`.

        Attributes
        ----------
        matrix_norm : :class:`pandas.DataFrame`
            If `assign`, the dataframe with normalized values.
        norm_method : :obj:`str`
            If `assign`, the name of the method used to normalize: "cqn".
        """
        from ngs_toolkit.utils import cqn

        # Perform quantile normalization first
        if not hasattr(self, "nuc"):
            self.normalize_quantiles(matrix=matrix, samples=samples)

        # Get GC content and length of each feature
        if not hasattr(self, "nuc"):
            self.get_peak_gccontent_length()

        matrix_norm = cqn(
            matrix=self.get_matrix(matrix=matrix, samples=samples),
            gc_content=self.nuc["gc_content"],
            lengths=self.nuc["length"],
        )

        if save:
            matrix_norm.to_csv(
                os.path.join(self.results_dir, self.name + ".matrix_norm.csv"),
                index=True,
            )
        if assign:
            self.matrix_norm = matrix_norm
            self.norm_method = "cqn"

        return matrix_norm
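    # Usage sketch: CQN wraps the R "cqn" package (see install note in the
    # docstring); GC content and region lengths are computed on demand if the
    # `nuc` attribute is not yet set:
    #
    #   a.normalize_cqn()   # assigns `a.matrix_norm`; `a.norm_method` == "cqn"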
    @check_has_attributes(["sites"])
    def get_peak_gene_annotation(
        self, tss_file=None, max_dist=100000, save=True, output_prefix="", assign=True
    ):
        """
        Annotate peaks with the closest gene.

        The annotation reference can be given in the `tss_file` parameter;
        if omitted, it will be fetched if the analysis has
        `genome` and `organism` attributes.
        A dataframe with each feature's distance to the nearest gene is also saved.

        Parameters
        ----------
        tss_file : :obj:`str`, optional
            A valid BED file where the name field (4th column) identifies the gene
            and the 6th column contains the strand. Other fields will not be used.

            Default is to get gene position annotations.
        max_dist : :obj:`int`, optional
            Maximum absolute distance allowed to perform associations.
            Regions with no genes within the range will have NaN values.

            Default is 100000.
        save : :obj:`bool`, optional
            Whether to write the annotated DataFrame to disk.

            Default is :obj:`True`.
        output_prefix : :obj:`str`, optional
            Prefix to add to output file when save is True.

            Default is "" (empty string).
        assign : :obj:`bool`, optional
            Whether to assign the DataFrames to `Attributes`.

            Default is :obj:`True`.

        Attributes
        ----------
        gene_annotation : :class:`pandas.DataFrame`
            A pandas DataFrame containing the genome annotations
            of the region features.
            If a feature overlaps more than one gene,
            the gene values will be concatenated with a comma.
        closest_tss_distances : :class:`pandas.DataFrame`
            A pandas DataFrame containing unique region->gene associations.
            In contrast to the gene_annotation dataframe,
            this contains one row per region->gene assignment.

        Returns
        -------
        pandas.DataFrame
            A dataframe with genes annotated for the peak set.
        """
        import pybedtools

        from ngs_toolkit.utils import bed_to_index, get_this_file_or_timestamped

        cols = [6, 8, -1]  # gene_name, strand, distance

        if tss_file is None:
            _LOGGER.info(
                "Reference TSS file was not given, will try to get TSS annotations."
            )
            _LOGGER.info(
                "Getting TSS annotations for organism '{}', genome '{}'.".format(
                    self.organism, self.genome
                )
            )
            tss_file = self.get_resources(steps=["tss"])["tss_file"]

        # extract only relevant columns
        tss_file = get_this_file_or_timestamped(tss_file)

        if isinstance(self.sites, str):
            self.sites = pybedtools.BedTool(self.sites)

        # get closest TSS of each region
        tss = pybedtools.BedTool(tss_file)
        columns = ["chrom", "start", "end", "gene_name", "strand", "distance"]
        closest_tss_distances = self.sites.closest(tss, D="b").to_dataframe()
        closest_tss_distances = closest_tss_distances.iloc[:, [0, 1, 2] + cols]
        closest_tss_distances.columns = columns

        # set NaN to distance without assignment
        # (rather than the default '-1' from bedtools)
        closest_tss_distances.loc[
            closest_tss_distances["gene_name"] == ".", "distance"
        ] = np.nan

        # set NaN to assignments out of range
        closest_tss_distances.loc[
            closest_tss_distances["distance"].abs() > max_dist,
            ["gene_name", "strand", "distance"],
        ] = np.nan

        # aggregate annotation per peak, concatenate various genes (comma-separated)
        gene_annotation = (
            closest_tss_distances.groupby(["chrom", "start", "end"])
            .aggregate(lambda x: ",".join(set([str(i) for i in x if i != "."])))
            .reset_index()
        )
        closest_tss_distances.index = bed_to_index(closest_tss_distances)
        gene_annotation.index = bed_to_index(gene_annotation)

        # save to disk
        if save:
            if output_prefix != "":
                output_prefix += "."
            closest_tss_distances.to_csv(
                os.path.join(
                    self.results_dir,
                    self.name + ".closest_tss_distances.{}csv".format(output_prefix),
                ),
                index=True,
            )
            gene_annotation.to_csv(
                os.path.join(
                    self.results_dir,
                    self.name + ".gene_annotation.{}csv".format(output_prefix),
                ),
                index=True,
            )
        if assign:
            self.closest_tss_distances = closest_tss_distances
            self.gene_annotation = gene_annotation
        return gene_annotation
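    # Usage sketch: with `organism`/`genome` set, the TSS reference is fetched
    # automatically; otherwise pass a BED file via `tss_file`:
    #
    #   gene_annot = a.get_peak_gene_annotation(max_dist=100000)
    #   gene_annot["gene_name"].head()  # comma-separated when >1 gene per region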
    @check_has_attributes(["organism", "genome", "sites"])
    def get_peak_genomic_location(
        self, genomic_context_file=None, save=True, output_prefix="", assign=True
    ):
        """
        Annotate a consensus peak set (``sites`` attribute of the analysis)
        with its genomic context.

        The genomic context is mostly gene-centric, and includes overlap with
        gene promoters, UTRs, exons, introns and the remaining intergenic space.

        If no reference genomic annotation file is given
        (`genomic_context_file` kwarg), it will use the
        :func:`ngs_toolkit.general.get_genomic_context` function to get such data.
        For more customization of the annotations,
        use that function directly and pass the output file to this function.

        Parameters
        ----------
        genomic_context_file : :obj:`str`
            A 4 column BED file (chrom, start, end, feature),
            where feature is a string with the type of region.
            If not provided will be obtained with the get_genomic_context function.
        save : :obj:`bool`, optional
            Whether to write the annotated DataFrame to disk.

            Default is :obj:`True`.
        output_prefix : :obj:`str`, optional
            Prefix to add to output file when save is True.

            Default is "" (empty string).
        assign : :obj:`bool`, optional
            Whether to assign the DataFrames to `Attributes`.

            Default is :obj:`True`.

        Attributes
        ----------
        region_annotation, region_annotation_b : :class:`pandas.DataFrame`
            A DataFrame with the genome annotations of the region features
            or genome background.
        region_annotation_mapping, region_annotation_b_mapping : :class:`pandas.DataFrame`
            A DataFrame with one row for each region-to-genomic context mapping
            or genome background.

        Returns
        -------
        pandas.DataFrame
            The genomic context annotation for the peak set.
        """
        import pybedtools

        from ngs_toolkit.utils import bed_to_index, get_this_file_or_timestamped

        if genomic_context_file is None:
            _LOGGER.info(
                "Reference genomic context file was not given, will try to get it."
            )
            _LOGGER.info(
                "Getting genomic context annotations for organism '{}', genome '{}'. ".format(
                    self.organism, self.genome
                )
            )
            genomic_context_file = self.get_resources(steps=["genomic_context"])[
                "genomic_context_file"
            ]

        context = pybedtools.BedTool(
            get_this_file_or_timestamped(genomic_context_file)
        )

        if isinstance(self.sites, str):
            self.sites = pybedtools.BedTool(self.sites)

        # create background
        # shuffle regions in genome to create background
        # (keep them in the same chromosome)
        background = self.sites.shuffle(genome=self.genome, chrom=True)

        cols = [0, 1, 2, -1]
        for label, attr, bed in [
            ("background", "region_annotation_b", background),
            ("real", "region_annotation", self.sites),
        ]:
            annot = (
                bed.intersect(context, wa=True, wb=True, f=0.2)
                .sort()
                .to_dataframe()
                .iloc[:, cols]
            )
            annot.index = bed_to_index(annot)
            annot.columns = ["chrom", "start", "end", "genomic_region"]
            # remove duplicates (there shouldn't be any anyway)
            annot = annot.drop_duplicates()

            # join various annotations per peak
            annot_comp = (
                annot.groupby(["chrom", "start", "end"])
                .aggregate(lambda x: ",".join(set([str(i) for i in x])))
                .reset_index()
            )
            annot_comp.index = bed_to_index(annot_comp)
            annot_comp.columns = ["chrom", "start", "end", "genomic_region"]

            # save to disk
            if save:
                a = "" if (label == "real") else ("_" + label)
                if output_prefix != "":
                    output_prefix += "."
                annot.to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name
                        + ".region_annotation{}_mapping.{}csv".format(a, output_prefix),
                    ),
                    index=True,
                )
                annot_comp.to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name
                        + ".region_annotation{}.{}csv".format(a, output_prefix),
                    ),
                    index=True,
                )

            if assign:
                setattr(self, attr, annot_comp)
                setattr(self, attr + "_mapping", annot)
        return self.region_annotation
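    # Usage sketch: annotate regions with gene-centric context and compare
    # against a shuffled background of the same regions:
    #
    #   region_annot = a.get_peak_genomic_location()
    #   region_annot["genomic_region"].value_counts()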
    @check_has_attributes(["organism", "genome", "sites"])
    def get_peak_chromatin_state(
        self, chrom_state_file, frac=0.2, save=True, output_prefix="", assign=True
    ):
        """
        Annotate a consensus peak set (``sites`` attribute of the analysis)
        with its chromatin state context.

        This would be given, for example, by a chromatin state segmentation file
        from projects such as Roadmap Epigenomics.

        See examples of such files for various cell types/assemblies
        `in this website <https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/>`_
        (the "dense.bed.gz" files are optimal).

        Parameters
        ----------
        chrom_state_file : :obj:`str`
            A 4 column BED file (chrom, start, end, feature),
            where feature is a string with the type of region.
            Additional columns are ignored.
        frac : float
            Minimal fraction of region to overlap with a feature.

            Defaults to 0.2.
        save : :obj:`bool`, optional
            Whether to write the annotated DataFrame to disk.

            Default is :obj:`True`.
        output_prefix : :obj:`str`, optional
            Prefix to add to output file when save is True.

            Default is "" (empty string).
        assign : :obj:`bool`, optional
            Whether to assign the DataFrames to `Attributes`.

            Default is :obj:`True`.

        Returns
        -------
        pandas.DataFrame
            The chromatin state annotation for the peak set.

        Attributes
        ----------
        chrom_state_annotation, chrom_state_annotation_b : :class:`pandas.DataFrame`
            A DataFrame with the chromatin state annotations of
            the region features or of the genome background.
        chrom_state_annotation_mapping, chrom_state_annotation_b_mapping : :class:`pandas.DataFrame`
            A DataFrame with one row for each chromatin state-region mapping
            or for the genome background.
        """
        import pybedtools

        from ngs_toolkit.utils import bed_to_index

        states = pybedtools.BedTool(chrom_state_file)

        if isinstance(self.sites, str):
            self.sites = pybedtools.BedTool(self.sites)

        # create background
        # shuffle regions in genome to create background
        # (keep them in the same chromosome)
        background = self.sites.shuffle(genome=self.genome, chrom=True)

        for label, attr, bed in [
            ("real", "chrom_state_annotation", self.sites),
            ("background", "chrom_state_annotation_b", background),
        ]:
            _LOGGER.debug(
                "Overlapping chromatin state annotation with {} regions.".format(label)
            )
            annot = bed.intersect(states, wa=True, wb=True, f=frac, loj=True)
            try:
                annot = annot.to_dataframe(usecols=[0, 1, 2, 6])
            except pd.errors.EmptyDataError:
                _LOGGER.error("Could not annotate region set.")
                raise
            annot.iloc[:, 3] = annot.iloc[:, 3].astype(str)

            # remove duplicates (there shouldn't be any anyway)
            annot = annot.drop_duplicates()
            annot.index = bed_to_index(annot)
            annot.columns = ["chrom", "start", "end", "chromatin_state"]

            # join various annotations per peak
            annot_comp = (
                annot.groupby(["chrom", "start", "end"])
                .aggregate(lambda x: ",".join(set([str(i) for i in x])))
                .reset_index()
            )
            annot_comp.index = bed_to_index(annot_comp)
            annot_comp.columns = ["chrom", "start", "end", "chromatin_state"]

            # save to disk
            if save:
                a = "" if (label == "real") else ("_" + label)
                if output_prefix != "":
                    output_prefix += "."
                annot.to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name
                        + ".chrom_state_annotation{}_mapping.{}csv".format(
                            a, output_prefix
                        ),
                    ),
                    index=True,
                )
                annot_comp.to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name
                        + ".chrom_state_annotation{}.{}csv".format(a, output_prefix),
                    ),
                    index=True,
                )

            if assign:
                setattr(self, attr, annot_comp)
                setattr(self, attr + "_mapping", annot)
        return self.chrom_state_annotation
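    # Usage sketch: the segmentation file path below is hypothetical; any
    # ChromHMM-style "dense" BED file works, as noted in the docstring:
    #
    #   a.get_peak_chromatin_state("external/E032_15_coreMarks_dense.bed.gz")
    #   a.chrom_state_annotation["chromatin_state"].value_counts()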
    def get_sex_chrom_ratio(
        self,
        matrix="matrix_norm",
        sex_chroms=["chrX", "chrY"],
        output_dir="{results_dir}",
        output_prefix="sex_chrom_ratio",
        plot=True,
    ):
        """
        Get the ratio of signal between sex chromosomes.
        Useful to quickly assign sex to samples.

        Parameters
        ----------
        matrix : :obj:`pandas.DataFrame`, optional
            Matrix to use.
            Defaults to `matrix_norm`.
        sex_chroms : :obj:`list`, optional
            Names of the two sex chromosomes to use.
        output_dir : :obj:`str`, optional
            Directory to write output to.
        output_prefix : :obj:`str`, optional
            String to prefix output with.
        plot : :obj:`bool`, optional
            Whether to produce illustrative plots.

        Returns
        -------
        pd.Series
            Ratio of sex chromosomes defined as `sex_chroms[1] - sex_chroms[0]`.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns
        from natsort import natsorted

        from ngs_toolkit.graphics import savefig

        matrix = self.get_matrix(matrix)
        output_dir = self._format_string_with_attributes(output_dir)

        # remove per sample mean
        matrix -= matrix.mean()

        chroms = matrix.index.str.extract(r"^(.*):\d+-\d+$").iloc[:, 0].values
        if not all([x in chroms for x in sex_chroms]):
            msg = f"Requested sex chromosomes {', '.join(sex_chroms)} not found in matrix."
            _LOGGER.error(msg)
            raise ValueError(msg)

        matrix = matrix.assign(chrom=chroms)

        m = matrix.groupby("chrom").mean()
        # remove per chromosome mean
        m = (m.T - m.mean(1)).T
        # order chromosomes
        m = m.reindex(natsorted(m.index))

        # calculate ratio
        ratio = m.loc[sex_chroms[1]] - m.loc[sex_chroms[0]]
        ratio.name = "{}_to_{}_ratio".format(sex_chroms[1], sex_chroms[0])
        ratio.to_csv(
            os.path.join(output_dir, self.name + "." + output_prefix + ".csv"),
            header=True,
        )

        if plot:
            ratio.sort_values(inplace=True)
            m = m.reindex(ratio.index, axis=1)

            # Clustermap
            if isinstance(ratio.index, pd.MultiIndex):
                cols = m.columns.get_level_values("sample_name")
            else:
                cols = m.columns
            grid = sns.clustermap(
                m.T,
                z_score=1,
                center=0,
                cmap="RdBu_r",
                figsize=(m.shape[0] * 0.3, m.shape[1] * 0.3),
                row_cluster=False,
                col_cluster=False,
                cbar_kws={"label": "Deviation from mean\nchromosome accessibility"},
                yticklabels=cols,
            )
            grid.ax_heatmap.set_xlabel("Chromosomes")
            grid.ax_heatmap.set_ylabel("Samples")
            savefig(
                grid,
                os.path.join(
                    output_dir, self.name + "." + output_prefix + ".clustermap.svg"
                ),
            )

            # Value vs Rank
            fig, axis = plt.subplots(1, figsize=(3, ratio.shape[0] * 0.3))
            axis.scatter(ratio, ratio.rank(), linestyle="-")
            axis.axvline(0, linestyle="--", color="grey")
            axis.set_yticks(range(1, ratio.shape[0] + 1))
            if isinstance(ratio.index, pd.MultiIndex):
                axis.set_yticklabels(ratio.index.get_level_values("sample_name"))
            else:
                axis.set_yticklabels(ratio.index)
            axis.set_ylabel("Samples")
            axis.set_xlabel(ratio.name.replace("_", " "))
            v = ratio.abs().max()
            v += v * 0.1
            axis.set_xlim((-v, v))
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(
                    output_dir, self.name + "." + output_prefix + ".rank_vs_ratio.svg"
                ),
            )

        return ratio.sort_index()
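    # Usage sketch: the returned Series is `mean(sex_chroms[1]) - mean(sex_chroms[0])`
    # per sample (after per-sample and per-chromosome centering), so with the
    # default `sex_chroms` larger values indicate relatively more chrY signal:
    #
    #   ratio = a.get_sex_chrom_ratio()
    #   ratio.sort_values()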
    def get_gene_level_matrix(
        self,
        matrix="matrix_norm",
        reduce_func=np.mean,
        assign=True,
        save=True,
        output_file="{results_dir}/{name}.gene_coverage.csv",
    ):
        """
        Get gene-level measurements of coverage.

        Requires a 'gene_annotation' or 'closest_tss_distances' attribute to be
        set containing a mapping between the index of `matrix` and genes
        (produced from `get_peak_gene_annotation`).

        Parameters
        ----------
        matrix : :obj:`str`, optional
            Quantification matrix to use (e.g. 'matrix_raw' or 'matrix_norm').

            Default is "matrix_norm".
        reduce_func : func
            Function to apply to reduce values.

            Default is mean.
        assign : :obj:`bool`
            Whether to assign the matrix to an attribute of self named
            `matrix_gene`.

            Default is :obj:`True`.
        save : :obj:`bool`
            Whether to save to disk the coverage matrix with filename
            `output_file`.

            Default is :obj:`True`.
        output_file : :obj:`str`
            Path to save a CSV file with coverage output if `save` is `True`.

            Default is "{results_dir}/{name}.gene_coverage.csv".

        Returns
        -------
        pandas.DataFrame
            Coverage values reduced per gene.

        Attributes
        ----------
        matrix_gene : :obj:`pandas.DataFrame`
            Coverage values reduced per gene.
        """
        msg = "Analysis object lacks a 'gene_annotation' or 'closest_tss_distances' dataframe."
        hint = " Call 'analysis.get_peak_gene_annotation' to have region-gene associations."
        if not (
            hasattr(self, "gene_annotation") or hasattr(self, "closest_tss_distances")
        ):
            _LOGGER.error(msg + hint)
            raise AssertionError(msg)

        matrix = self.get_matrix(matrix).copy()

        if hasattr(self, "closest_tss_distances"):
            matrix2 = matrix.join(self.closest_tss_distances[["gene_name"]])
            matrix2 = matrix2.set_index("gene_name", append=True)
        else:
            g = (
                self.gene_annotation["gene_name"]
                .str.split(",")
                .apply(pd.Series)
                .stack()
            )
            g.index = g.index.droplevel(1)
            g.name = "gene_name"
            matrix2 = matrix.join(g).drop("gene_name", axis=1)
            matrix2.index = (
                matrix.join(g).reset_index().set_index(["index", "gene_name"]).index
            )
            matrix2.columns = matrix.columns
        matrix3 = matrix2.groupby(level="gene_name").apply(reduce_func)
        matrix3 = matrix3.loc[:, ~matrix3.isnull().all()]

        if assign:
            self.matrix_gene = matrix3
        if save:
            matrix3.to_csv(self._format_string_with_attributes(output_file))
        return matrix3
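    # Usage sketch: reduce the region-level matrix to genes, e.g. with the
    # median instead of the default mean:
    #
    #   gene_matrix = a.get_gene_level_matrix(matrix="matrix_norm",
    #                                         reduce_func=np.median)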
    def get_gene_level_changes(self, differential_results=None, reduce_func=np.mean):
        """
        Reduce changes in regulatory elements to gene-level
        by aggregating across regulatory elements.

        Requires a 'gene_annotation' attribute to be set containing a mapping
        between the index of `matrix` and genes
        (produced from `get_peak_gene_annotation`).

        Parameters
        ----------
        differential_results : :obj:`pandas.DataFrame`
            Matrix with differential results to use.

            Default is the 'differential_results' attribute of self.
        reduce_func : func
            Function to apply to reduce values.

            Default is mean.

        Returns
        -------
        pandas.DataFrame
            Changes in chromatin accessibility (log2FoldChanges)
            reduced per gene.
        """
        msg = "Analysis object lacks a 'gene_annotation' or 'closest_tss_distances' dataframe."
        hint = " Call 'analysis.get_peak_gene_annotation' to have region-gene associations."
        if not (
            hasattr(self, "gene_annotation") or hasattr(self, "closest_tss_distances")
        ):
            _LOGGER.error(msg + hint)
            raise AssertionError(msg)

        if differential_results is None:
            differential_results = self.differential_results

        if hasattr(self, "closest_tss_distances"):
            dr2 = differential_results.join(self.closest_tss_distances[["gene_name"]])
            dr2 = dr2.set_index("gene_name", append=True)
        else:
            g = (
                self.gene_annotation["gene_name"]
                .str.split(",")
                .apply(pd.Series)
                .stack()
            )
            g.index = g.index.droplevel(1)
            g.name = "gene_name"

            dr2 = differential_results.join(g).drop("gene_name", axis=1)
            dr2.index = (
                differential_results.join(g)
                .reset_index()
                .set_index(["index", "gene_name"])
                .index
            )
            dr2.columns = differential_results.columns

        dr3 = (
            dr2.reset_index()
            .groupby(["gene_name", "comparison_name"])
            .apply(reduce_func)
        )
        return dr3.loc[:, ~dr3.isnull().all()]
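    # Usage sketch: aggregate differential results (e.g. log2FoldChange) per gene
    # and comparison, assuming `a.differential_results` was produced beforehand by
    # the differential analysis step:
    #
    #   gene_changes = a.get_gene_level_changes()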
    def plot_peak_characteristics(
        self,
        samples=None,
        by_attribute=None,
        genome_space=3e9,
        output_dir="{results_dir}/peak_characteristics",
        output_prefix="{name}",
    ):
        """
        Produce several diagnostic plots of the analysis' consensus peak set
        and the samples' signal on them.

        Provides plots with samples grouped by `by_attribute` if given
        (a string or a list of strings).

        Parameters
        ----------
        samples : :obj:`list`, optional
            List of samples to restrict analysis to.
        by_attribute : {str, list}, optional
            Attribute or list of sample attributes to group samples by when
            plotting. This is done in addition to the plots with individual
            values per sample.
        genome_space : :obj:`int`
            Length of genome.

            Defaults to 3e9 basepairs (human genome).
        output_dir : :obj:`str`
            Directory to output files.
            Will be formatted with variables from Analysis.

            Defaults to "peak_characteristics" under the Analysis "results_dir".
        output_prefix : :obj:`str`
            Prefix to add to output files.

            Defaults to the Analysis' name.
        """
        # TODO: abstract genome space kwarg
        import multiprocessing
        import parmap
        import matplotlib.pyplot as plt
        import seaborn as sns

        from ngs_toolkit.utils import (
            count_bam_file_length,
            count_lines,
            get_total_region_area,
            get_region_lengths,
            get_regions_per_chromosomes,
        )
        from ngs_toolkit.graphics import savefig

        if samples is None:
            samples = self.samples
        output_dir = self._format_string_with_attributes(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_prefix = self._format_string_with_attributes(output_prefix)

        reads = parmap.map(
            count_bam_file_length, [s.aligned_filtered_bam for s in samples]
        )
        peaks = list(map(count_lines, [s.peaks for s in samples]))
        open_chrom = list(map(get_total_region_area, [s.peaks for s in samples]))

        stats = pd.DataFrame(
            [reads, peaks, open_chrom],
            index=["reads_used", "peak_number", "open_chromatin"],
            columns=[s.name for s in samples],
        ).T
        stats["peaks_norm"] = (stats["peak_number"] / stats["reads_used"]) * 1e3
        stats["open_chromatin_norm"] = stats["open_chromatin"] / stats["reads_used"]
        stats.to_csv(
            os.path.join(
                output_dir, "{}.open_chromatin_space.csv".format(output_prefix)
            ),
            index=True,
        )
        # stats = pd.read_csv(
        #     os.path.join(
        #         output_dir,
        #         "{}.open_chromatin_space.csv".format(output_prefix)),
        #     index_col=0)

        # median lengths per sample (split-apply-combine)
        if by_attribute is not None:
            stats = stats.join(self.get_sample_annotation())
            stats = pd.merge(
                stats,
                stats.groupby(by_attribute)["open_chromatin"]
                .median()
                .to_frame(name="group_open_chromatin")
                .reset_index(),
            ).set_index(stats.index)
            stats = pd.merge(
                stats,
                stats.groupby(by_attribute)["open_chromatin_norm"]
                .median()
                .to_frame(name="group_open_chromatin_norm")
                .reset_index(),
            ).set_index(stats.index)

        # plot
        stats = stats.sort_values("open_chromatin_norm")
        fig, axis = plt.subplots(2, 1, figsize=(1 * 3, 2 * 3))
        for ax, var in zip(axis, ["open_chromatin", "open_chromatin_norm"]):
            sns.barplot(
                y="index",
                x=var,
                orient="horiz",
                data=stats.reset_index(),
                palette="summer",
                ax=ax,
            )
            ax.set_ylabel("Sample name")
        axis[0].set_xlabel("Total open chromatin space (bp)")
        axis[1].set_xlabel("Total open chromatin space (normalized)")
        sns.despine(fig)
        savefig(
            fig,
            os.path.join(
                output_dir,
                "{}.total_open_chromatin_space.per_sample.svg".format(output_prefix),
            ),
        )

        if by_attribute is not None:
            fig, axis = plt.subplots(2, 1, figsize=(4 * 2, 6 * 1))
            stats = stats.sort_values("group_open_chromatin")
            sns.barplot(
                y=by_attribute,
                x="open_chromatin",
                orient="horiz",
                data=stats.reset_index(),
                palette="summer",
                ax=axis[0],
            )
            sns.stripplot(
                y=by_attribute,
                x="open_chromatin",
                orient="horiz",
                data=stats.reset_index(),
                palette="summer",
                ax=axis[0],
            )
            stats = stats.sort_values("group_open_chromatin_norm")
            sns.barplot(
                y=by_attribute,
                x="open_chromatin_norm",
                orient="horiz",
                data=stats.reset_index(),
                palette="summer",
                ax=axis[1],
            )
            sns.stripplot(
                y=by_attribute,
                x="open_chromatin_norm",
                orient="horiz",
                data=stats.reset_index(),
                palette="summer",
                ax=axis[1],
            )
            axis[0].set_xlabel("Total open chromatin space (bp)")
            axis[1].set_xlabel("Total open chromatin space (normalized)")
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(
                    output_dir,
                    "{}.total_open_chromatin_space.per_{}.svg".format(
                        output_prefix, by_attribute
                    ),
                ),
            )

        # plot distribution of peak lengths
        sample_peak_lengths = map(get_region_lengths, [s.peaks for s in samples])
        lengths = pd.melt(
            pd.DataFrame(sample_peak_lengths, index=[s.name for s in samples]).T,
            value_name="peak_length",
            var_name="sample_name",
        ).dropna()
        # median lengths per sample (split-apply-combine)
        lengths = pd.merge(
            lengths,
            lengths.groupby("sample_name")["peak_length"]
            .median()
            .to_frame(name="mean_peak_length")
            .reset_index()
            .sort_values("mean_peak_length"),
        )

        fig, axis = plt.subplots(2, 1, figsize=(3 * 1, 3 * 2), sharex=False)
        for ax in axis:
            sns.boxplot(
                y="sample_name",
                x="peak_length",
                orient="horiz",
                data=lengths,
                palette="summer",
                ax=ax,
                showfliers=False,
            )
            ax.set_ylabel("Sample name")
            ax.set_xlabel("Peak length (bp)")
        axis[1].set_xscale("log")
        sns.despine(fig)
        savefig(
            fig,
            os.path.join(
                output_dir, "{}.peak_lengths.per_sample.svg".format(output_prefix)
            ),
        )

        if by_attribute is not None:
            lengths = lengths.merge(self.get_sample_annotation())
            lengths = pd.merge(
                lengths,
                lengths.groupby(by_attribute)["peak_length"]
                .median()
                .to_frame(name="group_mean_peak_length")
                .reset_index()
                .sort_values("group_mean_peak_length"),
            ).set_index(lengths.index)

            fig, axis = plt.subplots(2, 1, figsize=(3 * 1, 3 * 2), sharex=False)
            for ax in axis:
                sns.boxplot(
                    y=by_attribute,
                    x="peak_length",
                    orient="horiz",
                    data=lengths,
                    palette="summer",
                    ax=ax,
                    showfliers=False,
                )
                ax.set_xlabel("Peak length (bp)")
            axis[1].set_xscale("log")
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(
                    output_dir,
                    "{}.peak_lengths.per_{}.svg".format(output_prefix, by_attribute),
                ),
            )

        # peaks per chromosome per sample
        chroms = (
            pd.DataFrame(
                map(get_regions_per_chromosomes, [s.peaks for s in samples]),
                index=[s.name for s in samples],
            )
            .fillna(0)
            .T
        )
        chroms_norm = (chroms / chroms.sum(axis=0)) * 100
        if hasattr(self, "organism"):
            if self.organism == "human":
                chroms_norm = chroms_norm.loc[
                    ["chr{}".format(i) for i in list(range(1, 23)) + ["X", "Y", "M"]],
                    :,
                ]
            elif self.organism == "mouse":
                chroms_norm = chroms_norm.loc[
                    ["chr{}".format(i) for i in list(range(1, 19)) + ["X", "Y", "M"]],
                    :,
                ]
        fig, axis = plt.subplots(1, 1, figsize=(8 * 1, 8 * 1))
        sns.heatmap(
            chroms_norm,
            square=True,
            cmap="summer",
            xticklabels=True,
            yticklabels=True,
            cbar_kws={"label": "Normalized accessibility"},
            ax=axis,
        )
        axis.set_xlabel("Sample name")
        axis.set_ylabel("Chromosome")
        axis.set_xticklabels(axis.get_xticklabels(), rotation=90, ha="right")
        axis.set_yticklabels(axis.get_yticklabels(), rotation=0, ha="right")
        savefig(
            fig,
            os.path.join(
                output_dir, "{}.peak_location.per_sample.svg".format(output_prefix)
            ),
        )

        # Peak set across samples:
        # interval lengths
        fig, axis = plt.subplots(1, 1, figsize=(3, 3))
        sns.distplot(
            [interval.length for interval in self.sites if interval.length < 2000],
            hist=False,
            kde=True,
            ax=axis,
        )
        axis.set_xlabel("Peak width (bp)")
        axis.set_ylabel("Density")
        sns.despine(fig)
        savefig(fig, os.path.join(output_dir, "{}.lengths.svg".format(output_prefix)))

        # plot support
        if hasattr(self, "support"):
            fig, axis = plt.subplots()
            sns.distplot(self.support["support"], bins=40, ax=axis)
            axis.set_ylabel("frequency")
            sns.despine(fig)
            savefig(
                fig, os.path.join(output_dir, "{}.support.svg".format(output_prefix))
            )

        # Plot distance to nearest TSS
        if hasattr(self, "closest_tss_distances"):
            fig, axis = plt.subplots(
                2, 1, figsize=(3 * 1, 3 * 2), sharex=False, sharey=False
            )
            for i, ax in enumerate(axis):
                sns.distplot(
                    self.closest_tss_distances["distance"],
                    bins=1000,
                    kde=True,
                    hist=False if (i % 2 == 0) else True,
                    ax=ax,
                )
                ax.set_xlabel("Distance to nearest TSS (bp)")
                ax.set_ylabel("Density")
            axis[1].set_yscale("log")
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(output_dir, "{}.tss_distance.svg".format(output_prefix)),
            )

        # Plot genomic regions
        datas = list()
        for name, attr, attr_b in [
            (
                "genomic_region",
                "region_annotation_mapping",
                "region_annotation_b_mapping",
            ),
            (
                "chromatin_state",
                "chrom_state_annotation_mapping",
                "chrom_state_annotation_b_mapping",
            ),
        ]:
            if hasattr(self, attr):
                f = getattr(self, attr)
                b = getattr(self, attr_b)
                # count region frequency
                data = f[name].value_counts().sort_values(ascending=False)
                background = b[name].value_counts().sort_values(ascending=False)
                data = data.to_frame(name="foreground").join(
                    background.to_frame(name="background")
                )
                data["fold_change"] = np.log2(data["foreground"] / data["background"])
                data.index.name = "region"

                # plot also % of genome space "used"
                f["length"] = f["end"] - f["start"]
                b["length"] = b["end"] - b["start"]
                s = f.groupby(name)["length"].sum()
                data.loc[:, "percent_genome_space"] = (s / genome_space).dropna() * 100
                datas.append(data)

                # plot together
                g = sns.FacetGrid(
                    data=pd.melt(data.reset_index(), id_vars="region"),
                    col="variable",
                    col_wrap=2,
                    sharex=False,
                    sharey=True,
                    size=2,
                    aspect=1.2,
                )
                g.map(sns.barplot, "value", "region", orient="horiz")
                sns.despine(fig)
                savefig(
                    g,
                    os.path.join(
                        output_dir, "{}.{}s.svg".format(output_prefix, name)
                    ),
                )

        # plot together
        if len(datas) > 1:
            data = pd.concat(datas)
            g = sns.FacetGrid(
                data=pd.melt(data.reset_index(), id_vars="region").sort_values(
                    "value"
                ),
                col="variable",
                col_wrap=2,
                sharex=False,
                sharey=True,
                size=3,
                aspect=1.2,
            )
            g.map(sns.barplot, "value", "region", orient="horiz")
            sns.despine(fig)
            savefig(
                g,
                os.path.join(
                    output_dir,
                    "{}.genomic_region_and_chromatin_states.svg".format(output_prefix),
                ),
            )

        # distribution of count statistics
        if hasattr(self, "stats"):
            for attr in self.stats.columns:
                fig, axis = plt.subplots(1, 1, figsize=(3, 3))
                sns.distplot(self.stats.loc[:, attr], hist=False, kde=True, ax=axis)
                sns.despine(fig)
                savefig(
                    fig,
                    os.path.join(
                        output_dir, "{}.{}.distplot.svg".format(output_prefix, attr)
                    ),
                )
        if hasattr(self, "support"):
            attr = "support"
            fig, axis = plt.subplots(1, 1, figsize=(3, 3))
            sns.distplot(self.support.loc[:, attr], hist=False, kde=True, ax=axis)
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(
                    output_dir, "{}.{}.distplot.svg".format(output_prefix, attr)
                ),
            )

        # Pairwise against mean
        if hasattr(self, "stats"):
            stats = self.stats.copy()
            if hasattr(self, "support"):
                stats = stats.join(self.support.loc[:, "support"])
            for attr in stats.columns:
                if attr == "mean":
                    continue
                p = stats[
                    (stats["mean"] > 0)
                    & (stats[attr] < np.percentile(stats[attr], 99) * 3)
                ]
                g = sns.jointplot(
                    p["mean"], p[attr], s=1, alpha=0.1, rasterized=True, height=3
                )
                savefig(
                    g.fig,
                    os.path.join(
                        output_dir,
                        "{}.mean_vs_{}.svg".format(output_prefix, attr),
                    ),
                )
    def plot_raw_coverage(self, samples=None, by_attribute=None):
        """
        Diagnostic plots on the samples' signal.

        Provides plots with samples grouped by `by_attribute` if given
        (a string or a list of strings).

        Parameters
        ----------
        samples : :obj:`list`
            List of :class:`peppy.Sample` objects to use for plotting.
        by_attribute : :obj:`str`, optional
            Attribute of samples to group by.
            Values will be aggregated across samples by that attribute.
        """
        # TODO: get matrix as input, move to graphics
        import matplotlib.pyplot as plt
        import seaborn as sns

        from ngs_toolkit.graphics import savefig

        if samples is None:
            samples = self.samples

        if by_attribute is None:
            cov = pd.melt(
                np.log2(1 + self.matrix_raw[[s.name for s in samples]]),
                var_name="Sample name",
                value_name="Raw counts (log2)",
            )
            fig, axis = plt.subplots(1, 1, figsize=(6, 1 * 4))
            sns.violinplot(
                "Raw counts (log2)",
                "Sample name",
                orient="horizontal",
                palette="tab20",
                data=cov,
                ax=axis,
            )
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(
                    self.results_dir, self.name + ".raw_counts.violinplot.svg"
                ),
            )
        else:
            attrs = set([getattr(s, by_attribute) for s in samples])
            fig, axis = plt.subplots(len(attrs), 1, figsize=(8, len(attrs) * 6))
            for i, attr in enumerate(attrs):
                _LOGGER.info(attr)
                cov = pd.melt(
                    np.log2(
                        1
                        + self.matrix_raw[
                            [
                                s.name
                                for s in samples
                                if getattr(s, by_attribute) == attr
                            ]
                        ]
                    ),
                    var_name="Sample name",
                    value_name="Raw counts (log2)",
                )
                sns.violinplot(
                    "Raw counts (log2)",
                    "Sample name",
                    orient="horizontal",
                    palette="tab20",
                    data=cov,
                    ax=axis[i],
                )
                axis[i].set_title(attr)
                axis[i].set_xticklabels(axis[i].get_xticklabels(), rotation=90)
            sns.despine(fig)
            savefig(
                fig,
                os.path.join(
                    self.results_dir,
                    self.name
                    + ".raw_counts.violinplot.by_{}.svg".format(by_attribute),
                ),
            )
    def plot_coverage(self):
        """
        Diagnostic plots of the normalized coverage values and their
        variability metrics, grouped by genomic region and chromatin state.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        # TODO: add plots for overall genome
        # TODO: add raw counts too
        data = self.matrix_norm.copy()
        # TODO: rewrite to avoid annotation columns living in the matrix in the first place
        variables = ["gene_name", "genomic_region", "chromatin_state"]

        for variable in variables:
            # separate comma-delimited fields into one row per value
            d = data[variable].str.split(",").apply(pd.Series).stack()
            # stacking returns a MultiIndex Series, so get rid of the second
            # index level (the first is from the original row)
            d.index = d.index.droplevel(1)
            # drop the original column so there are no conflicts on join
            data = data.drop([variable], axis=1)
            d.name = variable
            data = data.join(d)  # joins on index

        variables = [
            "chrom",
            "start",
            "end",
            "ensembl_transcript_id",
            "distance",
            "ensembl_gene_id",
            "support",
            "mean",
            "variance",
            "std_deviation",
            "dispersion",
            "qv2",
            "amplitude",
            "gene_name",
            "genomic_region",
            "chromatin_state",
        ]
        # Plot
        data_melted = pd.melt(
            data, id_vars=variables, var_name="sample", value_name="norm_counts"
        )

        # transform dispersion
        data_melted["dispersion"] = np.log2(1 + data_melted["dispersion"])

        for group, label in [
            ("genomic_region", "per_genomic_region"),
            ("chromatin_state", "chromatin_state"),
        ]:
            # all values of each metric together in one violin plot per group
            for metric in ["norm_counts", "dispersion", "qv2"]:
                fig, axis = plt.subplots(1)
                sns.violinplot(group, metric, data=data_melted, ax=axis)
                # the normalized counts plot carries no metric infix in its file name
                infix = "" if metric == "norm_counts" else metric + "."
                fig.savefig(
                    os.path.join(
                        self.results_dir,
                        self.name + ".norm_counts." + infix + label + ".violinplot.svg",
                    ),
                    bbox_inches="tight",
                )

            # each metric separated by group value in one grid
            for metric in ["mean", "dispersion", "qv2", "support"]:
                g = sns.FacetGrid(data_melted, col=group, col_wrap=3)
                g.map(sns.distplot, metric, hist=False, rug=False)
                g.fig.savefig(
                    os.path.join(
                        self.results_dir,
                        self.name
                        + ".norm_counts."
                        + metric
                        + "."
                        + label
                        + ".distplot.svg",
                    ),
                    bbox_inches="tight",
                )
        plt.close("all")
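    # Editor's sketch: ``plot_coverage`` expects ``matrix_norm`` to carry the
    # annotation columns listed above, so the annotation steps from the class
    # docstring example must run first:
    #
    # >>> a.measure_coverage()
    # >>> a.normalize()
    # >>> a.annotate_features()
    # >>> a.plot_coverage()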
[docs] def region_context_enrichment(
        self,
        regions,
        steps=["genomic_region", "chromatin_state"],
        background="region_set",
        prefix="region_type_enrichment",
        output_dir="{results_dir}",
    ):
        """
        Characterize a subset of the regions (e.g. differential regions)
        in terms of their genomic context.

        Parameters
        ----------
        regions : {:obj:`list`, :obj:`pandas.DataFrame`, :obj:`pandas.Index`}
            Subset of regions of interest to analyze.
            Must be a subset of the universe (i.e. ``sites`` attribute).

        steps : :obj:`list`, optional
            Steps of enrichment to perform.
            Defaults to all available: 'genomic_region' and 'chromatin_state'.

        background : :obj:`str`, optional
            Which set to consider as background.
            Options are:

             * "region_set": the consensus region set of the analysis
             * "genome": a randomized region set of the same size as the consensus set, sampled across the genome

        prefix : :obj:`str`, optional
            Prefix for saved files.
            Default is "region_type_enrichment".

        output_dir : :obj:`str`, optional
            Directory to write results to.
            Can be formatted with Analysis attributes.
            Default is "{results_dir}".

        Returns
        -------
        pandas.DataFrame
            Enrichment results
        """
        from ngs_toolkit.utils import log_pvalues
        from scipy.stats import fisher_exact

        output_dir = self._format_string_with_attributes(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if isinstance(regions, pd.DataFrame):
            regions = regions.index.tolist()
        elif isinstance(regions, pd.Index):
            regions = regions.tolist()
        elif isinstance(regions, list):
            pass
        else:
            msg = "Input `regions` not understood."
            _LOGGER.error(msg)
            raise ValueError(msg)

        options = ["region_set", "genome"]
        if background not in options:
            msg = "Option `background` must be one of '{}'.".format(
                "', '".join(options)
            )
            raise ValueError(msg)

        # compare genomic regions and chromatin states
        enr = list()
        msg = "'{}' step selected, but analysis does not have '{}'."
        msg2 = "'genome' selected, but analysis does not have '{}'."
        for step, matrix, matrix_b in [
            (
                "genomic_region",
                "region_annotation_mapping",
                "region_annotation_b_mapping",
            ),
            (
                "chromatin_state",
                "chrom_state_annotation_mapping",
                "chrom_state_annotation_b_mapping",
            ),
        ]:
            if step not in steps:
                continue
            if not hasattr(self, matrix):
                _LOGGER.warning(msg.format(step, matrix))
                continue
            _LOGGER.debug("Getting enrichment of regions in '{}'.".format(step))

            # Count foreground occurrences
            annot = getattr(self, matrix)
            res = (
                annot.loc[list(set(regions)), step]
                .value_counts()
                .to_frame("foreground")
            )

            # Count background occurrences
            # in case of background == 'genome', we simply replace the dataframes
            if background == "genome":
                try:
                    annot = getattr(self, matrix_b)
                except AttributeError:
                    _LOGGER.warning(msg2.format(matrix_b))
                    continue

            # account for foreground regions not annotated
            res_b = annot.loc[:, step].value_counts().to_frame(name="universe")
            if not res.index.isin(res_b.index).all():
                m = res.index[~res.index.isin(res_b.index)]
                msg3 = "Foreground regions contain {} types not in the background: '{}'.".format(
                    step, "', '".join(m)
                )
                msg3 += " Continuing without those."
                _LOGGER.warning(msg3)
                res = res.reindex(res_b.index)

            # join
            res = res.join(res_b, how="outer").fillna(0).astype(int)

            # Calculate log fold enrichment:
            # normalize to totals:
            res.loc[:, "foreground_fraction"] = (
                res["foreground"] / res["foreground"].sum()
            )
            res.loc[:, "universe_fraction"] = res["universe"] / res["universe"].sum()
            res.loc[:, "log2_fold_change"] = np.log2(
                res["foreground_fraction"] / res["universe_fraction"]
            )
            # Calculate overlap p-value:
            # counts of each feature among the regions not in the foreground
            bg_counts = annot.loc[~annot.index.isin(regions), step].value_counts()
            for feature in res["foreground"].index:
                a = res.loc[feature, "foreground"]
                b = res.loc[:, "foreground"].drop(feature).sum()
                c = bg_counts.get(feature, 0)
                d = bg_counts.drop(feature, errors="ignore").sum()

                res.loc[feature, "odds_ratio"], res.loc[
                    feature, "p_value"
                ] = fisher_exact([[a, c], [b, d]], alternative="two-sided")
            res.loc[:, "log2_odds_ratio"] = np.log2(res["odds_ratio"])
            res.loc[:, "-log10(p-value)"] = log_pvalues(res["p_value"])
            res.loc[:, "region_type"] = step

            # Append
            enr.append(res)

        # save
        enr = pd.concat(enr)
        enr.index.name = "region"
        enr.to_csv(os.path.join(output_dir, prefix + ".csv"), index=True)
        return enr
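    # Editor's sketch: testing a hypothetical set of differential regions
    # against the consensus region set. The "padj" column and the 0.05 cutoff
    # are assumptions about the differential results table:
    #
    # >>> diff = a.differential_results.query("padj < 0.05")
    # >>> enr = a.region_context_enrichment(diff, background="region_set")
    # >>> enr.sort_values("p_value").head()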
[docs] def characterize_regions_function(
        self,
        differential,
        output_dir,
        prefix,
        universe_file=None,
        run=True,
        genome=None,
        steps=["region", "lola", "meme", "homer", "enrichr"],
    ):
        """
        Perform a range of functional enrichments of a set of regions given in
        ``differential`` (a dataframe which is typically a subset of an
        annotated coverage dataframe).
        Extracts the regions, their underlying sequence and associated genes,
        and performs enrichment of genomic regions and chromatin states against
        a background, motif enrichment, location overlap analysis (LOLA), and
        gene set enrichment (using the Enrichr API).

        This requires several programs and R libraries:
            - MEME suite (AME)
            - HOMER suite (findMotifsGenome.pl)
            - LOLA (R library)

        Additionally, some genome-specific databases are needed to run these programs.

        Parameters
        ----------
        differential : :obj:`pandas.DataFrame`
            Results of differential analysis for a given comparison of interest.

        output_dir : :obj:`str`
            Directory to output results to.

        prefix : :obj:`str`
            Prefix to use for output files.

        universe_file : :obj:`str`, optional
            Path to BED file with the universe regions the differential regions were selected from.

            Default is the ``sites`` attribute of the Analysis.

        run : :obj:`bool`, optional
            Whether to run enrichment commands now or to simply prepare the input files for them.

            Default is :obj:`True`.

        genome : :obj:`str`, optional
            Genome assembly of the analysis.

            Default is the genome assembly of the analysis (``genome`` attribute).

        steps : :obj:`list`, optional
            Which steps of the analysis to perform.

            Default is all: ['region', 'lola', 'meme', 'homer', 'enrichr'].
        """
        from ngs_toolkit.general import meme_ame, homer_motifs, lola, enrichr
        from ngs_toolkit.utils import bed_to_fasta, standard_score
        from ngs_toolkit.utils import location_index_to_bed

        # use all sites as universe
        if universe_file is None:
            try:
                universe_file = getattr(self, "sites").fn
                _LOGGER.info(
                    "Using default background region set from 'analysis.sites': '{}'.".format(
                        universe_file
                    )
                )
            except AttributeError as e:
                _LOGGER.error(
                    "Background region set 'analysis.sites' is not set! Cannot run LOLA!"
                )
                raise e

        if genome is None:
            genome = self.genome

        # make output dirs
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        bed = location_index_to_bed(differential.index)
        # save to bed
        bed_file = os.path.join(output_dir, "{}_regions.bed".format(prefix))
        bed.to_csv(bed_file, sep="\t", header=False, index=False)
        # save as tsv
        tsv_file = os.path.join(output_dir, "{}_regions.tsv".format(prefix))
        bed.reset_index().to_csv(tsv_file, sep="\t", header=False, index=False)

        # export gene names
        clean_gene = (
            differential["gene_name"]
            .str.split(",")
            .apply(pd.Series)
            .stack()
            .drop_duplicates()
        )
        clean_gene = clean_gene[~clean_gene.isin([".", "nan", ""])]
        clean_gene.to_csv(
            os.path.join(output_dir, "{}.gene_symbols.txt".format(prefix)),
            header=False,
            index=False,
        )
        if "ensembl_gene_id" in differential.columns:
            # export ensembl gene names
            clean = (
                differential["ensembl_gene_id"]
                .str.split(",")
                .apply(pd.Series)
                .stack()
                .drop_duplicates()
            )
            clean.to_csv(
                os.path.join(output_dir, "{}_genes.ensembl.txt".format(prefix)),
                header=False,
                index=False,
            )

        # export gene symbols with scaled absolute fold change
        if "log2FoldChange" in differential.columns:
            differential["score"] = standard_score(abs(differential["log2FoldChange"]))
            differential["abs_fc"] = abs(differential["log2FoldChange"])

            d = differential[["gene_name", "score"]].sort_values(
                "score", ascending=False
            )

            # split gene names from score if a reg. element was assigned to more than one gene
            a = d["gene_name"].str.split(",").apply(pd.Series).stack()
            a.index = a.index.droplevel(1)
            a.name = "gene_name"
            d = d[["score"]].join(a)
            # reduce various ranks to the mean per gene
            d = d.groupby("gene_name").mean().reset_index()
            d.to_csv(
                os.path.join(output_dir, "{}.gene_symbols.score.csv".format(prefix)),
                index=False,
            )

        # get fasta file with sequence underlying regions
        if ("meme" in steps) or ("homer" in steps):
            hint = " Will not do motif enrichment analysis."
            fasta_file = os.path.join(output_dir, "{}_regions.fa".format(prefix))
            resources = self.get_resources(steps=["genome"])["genome_file"]
            if "fasta" not in resources:
                reason = (
                    "Could not get genome sequence file in either FASTA or 2bit format."
                )
                _LOGGER.warning(reason + hint)
                # actually skip the motif enrichment steps, as the message says
                steps = [s for s in steps if s not in ("meme", "homer")]
            else:
                try:
                    bed_to_fasta(
                        input_bed=bed_file,
                        output_fasta=fasta_file,
                        genome_file=resources["fasta"],
                    )
                except EnvironmentError:
                    reason = "Could not get FASTA sequence for regions."
                    _LOGGER.warning(reason + hint)
                    steps = [s for s in steps if s not in ("meme", "homer")]

        if not run:
            return

        if "region" in steps:
            self.region_context_enrichment(differential, output_dir=output_dir)

        # MEME
        if "meme" in steps:
            _LOGGER.info("Running MEME-AME for '{}'".format(prefix))
            omap = {"hg38": "human", "hg19": "human", "mm10": "mouse"}
            meme_ame(fasta_file, output_dir, organism=omap[genome])

        # HOMER
        if "homer" in steps:
            _LOGGER.info("Running HOMER for '{}'".format(prefix))
            homer_motifs(bed_file, output_dir, genome=genome)

        # LOLA
        if "lola" in steps:
            _LOGGER.info("Running LOLA for '{}'".format(prefix))
            try:
                lola(bed_file, universe_file, output_dir, genome=genome)
            except Exception:
                _LOGGER.error("LOLA analysis for '{}' failed!".format(prefix))

        # Enrichr
        if "enrichr" in steps:
            _LOGGER.info("Running Enrichr for '{}'".format(prefix))
            results = enrichr(clean_gene.to_frame(name="gene_name"))
            results.to_csv(
                os.path.join(output_dir, "{}.enrichr.csv".format(prefix)),
                index=False,
                encoding="utf-8",
            )
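    # Editor's sketch: running only the steps that need no external tools on
    # the same hypothetical differential regions. Assumes ``diff`` is indexed
    # by region coordinates and carries a "gene_name" column, and that
    # ``a.sites`` is set; output path and prefix are illustrative:
    #
    # >>> a.characterize_regions_function(
    # ...     differential=diff,
    # ...     output_dir="results/region_enrichments",
    # ...     prefix="differential_regions",
    # ...     steps=["region", "enrichr"],
    # ... )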