Source code for ngs_toolkit.cnv

#!/usr/bin/env python


import os

import numpy as np
import pandas as pd

from ngs_toolkit import _LOGGER
from ngs_toolkit.analysis import Analysis
from ngs_toolkit.decorators import check_has_attributes
from ngs_toolkit.utils import warn_or_raise

from ngs_toolkit.demo.data_generator import DEFAULT_CNV_RESOLUTIONS


[docs]class CNVAnalysis(Analysis):
    """
    Class to model analysis of CNV data.
    Inherits from the :class:`~ngs_toolkit.analysis.Analysis` class.

    Parameters
    ----------
    name : :obj:`str`, optional
        Name of the analysis.

        Defaults to "analysis".
    from_pep : :obj:`str`, optional
        PEP configuration file to initialize analysis from.
        The analysis will adopt as much attributes from the PEP as possible
        but keyword arguments passed at initialization will still have priority.

        Defaults to :obj:`None` (no PEP used).
    from_pickle : :obj:`str`, optional
        Pickle file of an existing serialized analysis object
        from which the analysis should be loaded.

        Defaults to :obj:`None` (will not load from pickle).
    root_dir : :obj:`str`, optional
        Base directory for the project.

        Defaults to current directory or to what is specified in PEP if :attr:`~ngs_toolkit.analysis.Analysis.from_pep`.
    data_dir : :obj:`str`, optional
        Directory containing processed data (e.g. by looper) that will
        be input to the analysis. This is in principle not required.

        Defaults to "data".
    results_dir : :obj:`str`, optional
        Directory to contain outputs produced by the analysis.

        Defaults to "results".
    prj : :class:`peppy.Project`, optional
        A :class:`peppy.Project` object that this analysis is tied to.

        Defaults to :obj:`None`.
    samples : :obj:`list`, optional
        List of :class:`peppy.Sample` objects that this analysis is tied to.

        Defaults to :obj:`None`.
    kwargs : :obj:`dict`, optional
        Additional keyword arguments will be passed to parent class :class:`~ngs_toolkit.analysis.Analysis`.

    Examples
    --------
    >>> from ngs_toolkit.cnv import CNVAnalysis

    This is an example of a CNV analysis:

    >>> pep = "metadata/project_config.yaml"
    >>> a = CNVAnalysis(from_pep=pep)
    >>> # Get consensus peak set from all samples
    >>> a.get_cnv_data()
    >>> # Normalize
    >>> a.normalize(method="median")
    >>> # Segmentation
    >>> a.segment_genome()
    >>> # General plots
    >>> a.plot_all_data()
    >>> a.plot_segmentation_stats()
    >>> # Unsupervised analysis
    >>> a.unsupervised_analysis()
    >>> # Save object
    >>> a.to_pickle()
    """
    _data_type = "CNV"

    def __init__(
        self,
        name=None,
        from_pep=False,
        from_pickle=False,
        root_dir=None,
        data_dir="data",
        results_dir="results",
        prj=None,
        samples=None,
        **kwargs
    ):
        # The check for existance is to make sure other classes can inherit from this
        default_args = {
            "data_type": "CNV",
            "__data_type__": "CNV",
            "var_unit_name": "bin",
            "quantity": "copy_number",
            "norm_units": "log2(ratio)"}
        for k, v in default_args.items():
            if not hasattr(self, k):
                setattr(self, k, v)

        super(CNVAnalysis, self).__init__(
            name=name,
            from_pep=from_pep,
            from_pickle=from_pickle,
            root_dir=root_dir,
            data_dir=data_dir,
            results_dir=results_dir,
            prj=prj,
            samples=samples,
            **kwargs
        )

        if not hasattr(self, "resolutions"):
            self.resolutions = DEFAULT_CNV_RESOLUTIONS

[docs]    def load_data(
        self,
        output_map=None,
        only_these_keys=None,
        resolutions=None,
        prefix="{results_dir}/{name}",
        permissive=True,
    ):
        """
        Load the output files of the major functions of the Analysis.

        Parameters
        ----------
        output_map : :obj:`dict`
            Dictionary with {attribute_name: (file_path, kwargs)} to load the files.
            The kwargs in the tuple will be passed to :meth:`pandas.read_csv`.

            Defaults to the required to read the keys in ``only_these_keys``.
        only_these_keys : :obj:`list`, optional
            Iterable of analysis attributes to load up.
            Possible attributes:

                * "matrix_raw"
                * "matrix_norm"
                * "matrix_features"
                * "differential_results"

            Defaults to all of the above.

        resolutions: :obj:`list`
            List of resolution strings to get data for.

            Defaults to value of ``resolutions`` attribute of Analysis.
        prefix : :obj:`str`, optional
            String prefix of files to load.
            Variables in curly braces will be formated with attributes of analysis.

            Defaults to "{results_dir}/{name}".
        permissive : :obj:`bool`, optional
            Whether an error should be ignored if reading a file causes IOError.

            Default is :obj:`True`.

        Attributes
        ----------
        pandas.DataFrame
            Dataframes holding the respective data, available as attributes described
            in the `only_these_keys` parameter.

        Raises
        ----------
        IOError
            If not permissive and a file is not found
        """
        from ngs_toolkit.utils import fix_dataframe_header

        prefix = self._format_string_with_attributes(prefix)

        if resolutions is None:
            resolutions = self.resolutions

        if output_map is None:
            kwargs = {"index_col": 0}
            output_map = {
                "matrix_raw": {r: (prefix + ".{}.matrix_raw.csv".format(r), kwargs) for r in resolutions},
                "matrix_norm": {r: (prefix + ".{}.matrix_norm.csv".format(r), kwargs) for r in resolutions},
                "segmentation": {r: (prefix + ".{}.segmentation.csv".format(r), {}) for r in resolutions},
                "segmentation_annot": {r: (prefix + ".{}.segmentation.annotated.csv".format(r), {}) for r in resolutions}
            }
        if only_these_keys is None:
            only_these_keys = list(output_map.keys())

        output_map = {k: v for k, v in output_map.items() if k in only_these_keys}

        for name, f in output_map.items():
            for resolution, (file, kwargs) in f.items():
                file = file.format(resolution)
                _LOGGER.info(
                    "Loading '{}' analysis attribute for resolution '{}'."
                    .format(name, resolution))
                if not hasattr(self, name):
                    setattr(self, name, {resolution: None})
                try:
                    getattr(self, name)[resolution] = pd.read_csv(file, **kwargs)
                    # Fix possible multiindex for matrix_norm
                    if name == "matrix_norm":
                        getattr(self, name)[resolution] = fix_dataframe_header(
                            getattr(self, name)[resolution]
                        )
                except IOError as e:
                    if not permissive:
                        raise e
                    else:
                        _LOGGER.warning(e)

    def _copy_cnv_profile_plots(
            self, output_dir="{results_dir}/cnv_profiles",
            output_prefix="log2_profile",
            resolutions=None, samples=None, permissive=True
    ):
        """
        Convenience to copy output plots from runnning several samples independently
        to a given directory.

        Parameters
        ----------
        output_dir : :obj:`str`, optional
            Directory to copy to.

            Defaults to "{results_dir}/cnv_profiles".
        output_prefix : :obj:`str`, optional
            Prefix for copied files.

            Defaults to "log2_profile".
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        permissive: :obj:`bool`, optional
            Whether missing files should raise an error.

            Defaults to :obj:`True.`
        """
        from tqdm import tqdm
        from glob import glob
        from shutil import copyfile

        if resolutions is None:
            resolutions = self.resolutions

        if samples is None:
            samples = self.samples

        output_dir = self._format_string_with_attributes(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for resolution in tqdm(resolutions, total=len(resolutions), desc="Resolution"):

            for sample in tqdm(samples, total=len(samples), desc="Sample"):
                # Read log2 file
                if not hasattr(sample, "log2_read_counts"):
                    sample.log2_read_counts = os.path.join(
                        self.data_dir,
                        sample.paths.sample_root, sample.name + "_{resolution}",
                        "CNAprofiles",
                        "log2_read_counts.igv")
                if "{resolution}" in sample.log2_read_counts:
                    input_file = sample.log2_read_counts.format(resolution=resolution)
                f = glob(input_file)
                if len(f) == 1:
                    f = f[0]
                else:
                    msg = "Sample '{}' does not have a PDF file!".format(sample.name)
                    if permissive:
                        _LOGGER.warning(msg)
                        continue
                    else:
                        raise OSError(msg)

                d = os.path.join(output_dir, sample.name + "." + resolution + "." + output_prefix + ".pdf")
                try:
                    copyfile(f, d)
                except OSError:
                    msg = "Could not copy file '{}' to '{}'!".format(f, d)
                    if permissive:
                        _LOGGER.warning(msg)
                    else:
                        raise OSError(msg)

[docs]    def get_cnv_data(
        self, resolutions=None, samples=None, save=True, assign=True, permissive=False
    ):
        """
        Load CNV data from ATAC-seq CNV pipeline and create CNV matrix at various resolutions.

        Parameters
        ----------
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        save: :obj:`bool`, optional
            Whether results should be saved to disc.

            Defaults to :obj:`True`
        assign: :obj:`bool`, optional
            Whether results should be assigned to an attribute in the Analsyis object.

            Defaults to :obj:`True`
        permissive: :obj:`bool`, optional
            Whether missing files should be allowed.

            Defaults to :obj:`False`

        Returns
        -------
        dict
            Dictionary with CNV matrices for each resolution.

        Raises
        -------
        IOError
            If not permissive and input files can't be read.

        Attributes
        ----------
        matrix : :obj:`dict`
            Sets a `matrix` dictionary with CNV matrices for each resolution.
        """
        # TODO: figure out a way of having the input file specified before hand
        from tqdm import tqdm
        from ngs_toolkit.utils import bed_to_index

        if resolutions is None:
            resolutions = self.resolutions

        if samples is None:
            samples = self.samples

        matrix_raw = dict()

        for resolution in tqdm(resolutions, total=len(resolutions), desc="Resolution"):
            matrix_raw[resolution] = pd.DataFrame()

            for sample in tqdm(samples, total=len(samples), desc="Sample"):
                # Read log2 file
                if not hasattr(sample, "log2_read_counts"):
                    msg = "Sample does not have a 'log2_read_counts' attribute."
                    warn_or_raise(AttributeError(msg), permissive)

                input_file = sample.log2_read_counts[resolution].format(resolution=resolution)
                try:
                    cov = pd.read_csv(
                        input_file, sep="\t", comment="#"
                    ).set_index("Feature")
                except IOError as e:
                    e = IOError(
                        "Sample %s does not have a 'log2_read_counts' file: '%s'." %
                        (sample.name, input_file))
                    warn_or_raise(e, permissive)

                # TODO: this is specific to CopyWriter, should be removed later
                # and probably replaced with the column position
                cov.columns = (
                    cov.columns.str.replace("log2.", "")
                    .str.replace(".trimmed.bowtie2.filtered.bam", "")
                    .str.replace(".merged.sorted.subsample.bam", "")
                )

                # normalize signal to control
                # # TODO: check whether there was a reason I was previously
                # # undoing and redoing the log
                # matrix_raw[resolution][sample.name] = np.log2(
                #     (
                #         (0.1 + (2 ** cov.loc[:, sample.name]))
                #         / (0.1 + (2 ** cov.iloc[:, -1]))
                #     )
                # )
                matrix_raw[resolution][sample.name] = cov.loc[:, sample.name] - cov.iloc[:, -1]
            if "cov" not in locals():
                msg = "None of the samples had a valid 'log2_read_counts' file."
                _LOGGER.error(msg)
                raise ValueError(msg)

            c = cov.columns.tolist()
            c[:3] = ["chrom", "start", "end"]
            cov.columns = c
            matrix_raw[resolution].index = bed_to_index(cov)

            if save:
                matrix_raw[resolution].to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name + ".{}.matrix_raw.csv".format(resolution),
                    ),
                    index=True,
                )

        if assign:
            self.matrix_raw = matrix_raw

        return matrix_raw

[docs]    def normalize(
        self,
        method="median",
        matrix="matrix_raw",
        samples=None,
        save=True,
        assign=True,
        **kwargs
    ):
        """
        Normalization of dictionary of matrices with (n_features, n_samples).

        Parameters
        ----------
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        method : :obj:`str`
            Normalization method to apply.

            Defaults to "median".
        matrix : :obj:`str`, optional
            Attribute name of dictionary of matrices to normalize.

            Defaults to `matrix_raw`.
        samples : :obj:`list`
            Iterable of peppy.Sample objects to restrict matrix to.

            Defaults to all in analysis.
        save: :obj:`bool`, optional
            Whether results should be saved to disc.

            Defaults to :obj:`True`
        assign: :obj:`bool`, optional
            Whether results should be assigned to an attribute in the Analsyis object.

            Defaults to :obj:`True`
        kwargs : :obj:`dict`, optional
            Additional kwargs are passed to the respective normalization method.

        Returns
        -------
        dict
            Dictionary with normalized CNV matrices for each resolution.

        Attributes
        ----------
        matrix_norm : :obj:`dict`
            Sets a `matrix_norm` dictionary with CNV matrices for each resolution.
        """
        matrix_norm = dict()
        if "resolutions" not in kwargs:
            resolutions = self.resolutions
        else:
            resolutions = kwargs['resolutions']

        if matrix is None:
            matrix = self.matrix_raw
        elif isinstance(matrix, str):
            matrix = getattr(self, matrix)

        for resolution in resolutions:
            if method == "median":
                matrix_norm[resolution] = self.normalize_median(
                    matrix=matrix[resolution], samples=samples, save=False, assign=False
                )
            elif method == "pca":
                if "pc" not in kwargs:
                    raise ValueError(
                        "If method is 'pca', the value of 'pc' must be given."
                    )
                matrix_norm[resolution] = self.normalize_pca(
                    matrix=matrix[resolution],
                    samples=samples,
                    pc=kwargs["pc"],
                    save=False,
                    assign=False,
                )
            else:
                raise ValueError("Requested method '{}' is not known.".format(method))

            if save:
                matrix_norm[resolution].to_csv(
                    os.path.join(
                        self.results_dir, self.name + ".{}.matrix_norm.csv".format(resolution)
                    ),
                    index=True,
                )
        if assign:
            self.matrix_norm = matrix_norm
        return matrix_norm

[docs]    def plot_all_data(
        self,
        matrix="matrix_norm",
        resolutions=None,
        samples=None,
        output_dir=None,
        output_prefix="all_data",
        sample_labels=True,
        **kwargs
    ):
        """
        Visualize CNV data genome-wide using heatmaps.
        Will be done independently for each specified resolution.

        Parameters
        ----------
        matrix : :obj:`str`, optional
            Attribute name of dictionary of matrices to normalize.

            Defaults to `matrix_norm`.
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        output_dir : :obj:`str`, optional
            Output directory.

            Defaults to Analysis results directory.
        output_prefix : :obj:`str`, optional
            Prefix to add to plots.

            Defaults to "{analysis_name}.all_data"
        sample_labels : :obj:`bool`, optional
            Whether to label samples with their name.

            Defaults to :obj:`True`
        **kwargs : :obj:`dict`
            Additional kwargs will be passed to `seaborn.clustermap`.
        """
        # TODO: add support for group colours
        from tqdm import tqdm
        import matplotlib.pyplot as plt
        import seaborn as sns
        from ngs_toolkit.graphics import savefig

        matrix = self.get_matrix(matrix)
        if resolutions is None:
            resolutions = self.resolutions
        if samples is None:
            samples = self.samples
        if output_dir is None:
            output_dir = self.results_dir
        names = [s.name for s in samples]

        # Plot mean and variationper chromosome
        for resolution in tqdm(resolutions, desc="Resolution"):
            r_names = [n for n in names if n in matrix[resolution].columns]
            p = os.path.join(
                self.results_dir, self.name
                + ".{}.{}.full_data.".format(resolution, output_prefix))
            # Plot all data
            kws = dict(
                cmap="RdBu_r", robust=True,
                rasterized=True,
                xticklabels=False,
                yticklabels=r_names if sample_labels else False,
                cbar_kws={"label": "log2(change)"})
            kws.update(kwargs)

            fig, axis = plt.subplots(1, 1, figsize=(4 * 2, 4 * 1))
            sns.heatmap(
                matrix[resolution].T,
                center=0, ax=axis, **kws
            )
            axis.set_xlabel("Chromosome position")
            axis.set_ylabel("Sample")
            savefig(fig, p + "heatmap.svg")

            grid = sns.clustermap(
                matrix[resolution].fillna(0).T,
                metric="correlation", center=0,
                col_cluster=False, figsize=(4 * 2, 4 * 1), **kws
            )
            grid.ax_heatmap.set_xlabel("Chromosome position")
            grid.ax_heatmap.set_ylabel("Sample")
            savefig(grid, p + "fillna.clustermap.svg")

[docs]    def plot_stats_per_chromosome(
        self,
        matrix="matrix_norm",
        resolutions=None,
        samples=None,
        output_dir="{results_dir}",
        output_prefix="all_data",
        sample_labels=True,
        **kwargs
    ):
        """
        Visualize mean and variation of CNV data for each chromosome using heatmaps.
        Will be done independently for each specified resolution.
        Will also be done once for all chromosomes and another time without sex chromosomes.

        Parameters
        ----------
        matrix : :obj:`str`, optional
            Attribute name of dictionary of matrices to normalize.

            Defaults to `matrix_norm`.
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        output_dir : :obj:`str`, optional
            Output directory.

            Defaults to Analysis results directory.
        output_prefix : :obj:`str`, optional
            Prefix to add to plots.

            Defaults to "{analysis_name}.all_data"
        sample_labels: :obj:`bool`, optional
            Whether to label samples with their name.

            Defaults to :obj:`True`
        **kwargs : :obj:`dict`
            Additional kwargs will be passed to `seaborn.clustermap`.
        """
        import seaborn as sns
        from tqdm import tqdm
        from ngs_toolkit.graphics import (
            clustermap_fix_label_orientation, savefig)

        matrix = self.get_matrix(matrix)
        if resolutions is None:
            resolutions = self.resolutions
        if samples is None:
            samples = self.samples
        output_dir = self._format_string_with_attributes(output_dir)
        if "{analysis_name}" in output_prefix:
            output_prefix = output_prefix.format(analysis_name=self.name)

        # Plot mean and variation per chromosome
        for resolution in tqdm(resolutions, desc="Resolution"):
            names = [s.name for s in samples if s.name in matrix[resolution].columns]
            to_plot = matrix[resolution].loc[:, names]

            to_plot["chr"] = list(map(lambda x: x[0], to_plot.index.str.split(":")))

            for label, function in tqdm(
                [("variation", np.std), ("mean", np.mean)], desc="metric"
            ):
                prefix = os.path.join(
                    output_dir, self.name
                    + ".{}.".format(resolution) + output_prefix + ".{}".format(label))
                kws = {"yticklabels": False, "xticklabels": sample_labels}
                kws.update(kwargs)

                p = to_plot[names + ["chr"]].groupby("chr").apply(function)
                grid = sns.clustermap(
                    p + p.min().min(),
                    cbar_kws={"label": label},
                    cmap="Greens", **kws)
                clustermap_fix_label_orientation(grid)
                savefig(grid.fig, prefix + "_per_chrom.svg")

                p = (
                    to_plot.loc[~to_plot["chr"].str.contains("X|Y"), names + ["chr"]]
                    .groupby("chr")
                    .apply(function))
                grid = sns.clustermap(
                    p + abs(p.min().min()),
                    cbar_kws={"label": label},
                    cmap="Greens", **kws)
                clustermap_fix_label_orientation(grid)
                savefig(grid.fig, prefix + "_per_chrom.no_sex_chroms.svg")

                grid = sns.clustermap(
                    p,
                    cbar_kws={"label": label + " (Z-score)"},
                    cmap="RdBu_r",
                    center=0,
                    z_score=1, **kws)
                clustermap_fix_label_orientation(grid)
                savefig(grid.fig, prefix + "_per_chrom.no_sex_chroms.zscore.svg")

[docs]    def segment_genome(
        self,
        matrix="matrix_norm",
        resolutions=None,
        samples=None,
        save=True,
        assign=True,
    ):
        """
        Segment CNV data to create calls of significant deviations.
        Will be done independently for each specified resolution.

        Requires the R package "DNAcopy" to be installed:
            >>> source('http://bioconductor.org/biocLite.R')
            >>> biocLite('DNAcopy')

        Parameters
        ----------
        matrix : :obj:`str`, optional
            Attribute name of dictionary of matrices to segment.

            Defaults to `matrix_norm`.
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        save: :obj:`bool`, optional
            Whether results should be saved to disc.

            Defaults to :obj:`True`
        assign: :obj:`bool`, optional
            Whether results should be assigned to an attribute in the Analsyis object.

            Defaults to :obj:`True`

        Returns
        -------
        dict
            Dictionary with segmentation for each resolution.

        Attributes
        ----------
        segmentation : :obj:`dict`
            Dictionary with CNV matrices for each resolution.
        """
        # TODO: implement distributed mode
        import warnings

        from tqdm import tqdm
        import rpy2
        from rpy2.rinterface import RRuntimeWarning
        from rpy2.robjects import numpy2ri, pandas2ri
        from rpy2.robjects.packages import STAP

        warnings.filterwarnings("ignore", category=RRuntimeWarning)
        numpy2ri.activate()
        pandas2ri.activate()

        DNAcopy = rpy2.robjects.packages.importr("DNAcopy")
        if matrix is None:
            matrix = self.matrix_norm
        elif isinstance(matrix, str):
            matrix = getattr(self, matrix)
        if resolutions is None:
            resolutions = self.resolutions
        if samples is None:
            samples = self.samples

        segmentation = dict()
        for resolution in tqdm(resolutions, desc="Resolution"):
            chrom = np.array(
                list(map(lambda x: x[0], matrix[resolution].index.str.split(":")))
            )
            start = np.array(
                list(map(
                    lambda x: int(x[1].split("-")[0]),
                    matrix[resolution].index.str.split(":"),
                ))
            )
            names = [s.name for s in samples if s.name in matrix[resolution].columns]
            df = matrix[resolution].reindex(names, axis=1)

            code = """
            run = function(df, chrom, start, names){
                return(
                    DNAcopy::segment(
                        DNAcopy::CNA(
                            df, chrom=chrom, maploc=start, sampleid=names)))
            }
            """
            run = STAP(code, name="run").run
            res = run(df, chrom, start, names)
            seg = DNAcopy.segments_p(res)
            summary = DNAcopy.segments_summary(res)
            segmentation[resolution] = seg.merge(summary)
            segmentation[resolution].columns = [
                "sample_name",
                "chrom",
                "start",
                "end",
                "bin_size",
                "segment_mean",
                "B_stat",
                "p_value",
                "lcl",
                "ucl",
                "segment_std",
                "segment_median",
                "segment_mad",
            ]
            segmentation[resolution]["segment_length"] = (
                segmentation[resolution]["end"] - segmentation[resolution]["start"]
            ) + int(resolution.replace("kb", "")) * 1000

            if save:
                segmentation[resolution].to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name + ".{}.segmentation.csv".format(resolution),
                    ),
                    index=False,
                )
        if assign:
            self.segmentation = segmentation

        return segmentation

[docs]    @check_has_attributes(['organism', 'genome'])
    def annotate_with_chrom_bands(
        self, segmentation=None, resolutions=None, save=True, assign=True
    ):
        """
        Annotate segmentation with chromosome bands and overlaping genes.
        Will be done independently for each specified resolution.

        Parameters
        ----------
        segmentation : :obj:`str`, optional
            Attribute name of dictionary of segmentation results.

            Defaults to `segmentation`.
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        samples : :obj:`list`, optional
            Samples to restrict analysis to.

            Defaults to samples in Analysis object.
        save: :obj:`bool`, optional
            Whether results should be saved to disc.

            Defaults to :obj:`True`
        assign: :obj:`bool`, optional
            Whether results should be assigned to an attribute in the Analsyis object.

            Defaults to :obj:`True`

        Returns
        -------
        dict
            Dictionary with annotated segmentation for each resolution.

        Attributes
        ----------
        segmentation_annot : :obj:`dict`
            Dictionary with CNV matrices for each resolution.
        """
        from ngs_toolkit.general import query_biomart
        import pybedtools

        if resolutions is None:
            resolutions = self.resolutions

        if segmentation is None:
            segmentation = self.segmentation

        organisms = {
            "human": {"species": "hsapiens", "ensembl_version": "grch37"},
            "mouse": {"species": "mmusculus", "ensembl_version": "grcm38"},
            "yeast": {"species": "scerevisiae", "ensembl_version": "R64"}}

        annotation = query_biomart(
            attributes=[
                "chromosome_name",
                "start_position",
                "end_position",
                "external_gene_name",
                "band"],
            species=organisms[self.organism]["species"],
            ensembl_version=organisms[self.organism]["ensembl_version"])

        annotation["chromosome_name"] = "chr" + annotation["chromosome_name"]
        annotation = annotation[~annotation["chromosome_name"].str.contains("_")]
        annotation["band"] = annotation["chromosome_name"] + "_" + annotation["band"]
        annotation_bed = pybedtools.BedTool.from_dataframe(annotation.fillna("NA"))

        segmentation_annot = dict()
        for resolution in resolutions:
            seg = segmentation[resolution]
            seg_bed = pybedtools.BedTool.from_dataframe(seg[["chrom", "start", "end"]])

            inter = seg_bed.intersect(annotation_bed, wa=True, wb=True).to_dataframe()
            annot = inter.groupby(["chrom", "start", "end"])["thickStart"].apply(
                lambda x: str.join(' ', set(x))
            )
            annot_band = inter.groupby(["chrom", "start", "end"])["thickEnd"].apply(
                lambda x: str.join(' ', set(x))
            )
            annot_band.name = "chromosome_band"
            annot = annot.to_frame(name="gene").join(annot_band)

            seg_annot = (
                seg.set_index(["chrom", "start", "end"]).join(annot).reset_index()
            )
            segmentation_annot[resolution] = seg_annot.reindex(
                seg.columns.tolist() + ["chromosome_band", "gene"], axis=1
            )

            if save:
                segmentation_annot[resolution].to_csv(
                    os.path.join(
                        self.results_dir,
                        self.name + ".{}.segmentation.annotated.csv".format(resolution),
                    ),
                    index=False,
                )

        if assign:
            self.segmentation_annot = segmentation_annot

[docs]    def plot_segmentation_stats(
        self,
        segmentation=None,
        resolutions=None,
        per_sample=False,
        output_dir="{results_dir}/segmentation",
        output_prefix="{resolution}.segmentation_metrics",
    ):
        """
        Visualize distribution of statistics of CNV data segmentation.
        Will be done independently for each specified resolution.

        Parameters
        ----------
        segmentation : :obj:`str`, optional
            Dictionary of segmentation results.

            Defaults to "segmentation".
        resolutions : :obj:`list`, optional
            Resolutions of analysis.

            Defaults to resolutions in Analysis object.
        per_sample: :obj:`bool`, optional
            Whether plots should be made for each sample too.

            Defaults to :obj:`False`
        output_dir : :obj:`str`, optional
            Output directory.

        output_prefix : :obj:`str`, optional
            Prefix to add to plots.

            Defaults to "{resolution}.segmentation_metrics"
        """
        # TODO: plot stats per sample too
        from ngs_toolkit.utils import log_p_value
        import seaborn as sns
        from ngs_toolkit.graphics import savefig

        if segmentation is None:
            segmentation = self.segmentation

        if resolutions is None:
            resolutions = self.resolutions

        output_dir = self._format_string_with_attributes(output_dir)

        metric_vars = [
            "bin_size",
            "segment_mean",
            "B_stat",
            "p_value",
            "lcl",
            "ucl",
            "segment_std",
            "segment_median",
            "segment_mad",
            "segment_length",
        ]

        for resolution in resolutions:
            segments = segmentation[resolution]
            segments["log_p_value"] = log_p_value(segments["p_value"])

            grid = sns.pairplot(
                segments[metric_vars + ["log_p_value"]].dropna(),
                vars=metric_vars + ["log_p_value"],
                plot_kws=dict(s=3, alpha=0.2, linewidth=0, rasterized=True),
                diag_kws=dict(rasterized=True),
            )
            if "{resolution}" in output_prefix:
                op = output_prefix.format(resolution=resolution)
            savefig(
                grid.fig,
                os.path.join(output_dir, op + ".all_samples.svg"))

        # if per_sample:


[docs]def all_to_igv(matrix, output_prefix, **kwargs):
    """
    Convert dictionary of DataFrame with CNV data in several resolutions to IGV format.

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        DataFrame with CNV data to convert.

    output_prefix : str
        Prefix to add to plots.

    **kwargs : :obj:`dict`, optional
        Additional parameters will be passed to ngs_toolkit.cnv.to_igv

    Returns
    -------
    dict
        Dictionary of CNV data in IGV format for each resolution.
    """
    from tqdm import tqdm

    igvs = dict()
    resolutions = matrix.keys()
    for resolution in tqdm(resolutions, total=len(resolutions), desc="Resolution"):
        _LOGGER.info("Making IGV visualization for resolution '{}'.".format(resolution))
        igvs[resolution] = to_igv(
            matrix[resolution],
            output_file="{}_{}.igv".format(output_prefix, resolution),
            **kwargs
        )

    return igvs


[docs]def to_igv(matrix, output_file=None, save=True, view_limits=(-2, 2)):
    """
    Convert DataFrame with CNV data to IGV format.

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        DataFrame with CNV data to convert.
    output_file : str, optional
        Output file.

        Required if `save` is True.
    save: :obj:`bool`, optional
        Whether results should be saved to disc.

        Defaults to :obj:`True`.
    view_limits : tuple, optional
        Extreme values (min, max) of color scale used to visualize in IGV.

        Defaults to (-2, 2).

    Returns
    -------
    pandas.DataFrame
        CNV data in IGV format.

    Raises
    -------
    ValueError:
        If `save` is True but `output_file` is None.
    """
    _LOGGER.info("Making IGV visualization")

    # as IGV file
    igv = pd.DataFrame(index=matrix.index)
    igv.loc[:, "Chromosome"] = list(map(lambda x: x[0], matrix.index.str.split(":")))
    igv.loc[:, "Start"] = list(map(lambda x: int(x[1].split("-")[0]), matrix.index.str.split(":")))
    igv.loc[:, "End"] = list(map(lambda x: int(x[1].split("-")[1]), matrix.index.str.split(":")))
    igv.loc[:, "Name"] = igv.index

    igv = igv.join(matrix).reset_index(drop=True)

    if save:
        if output_file is None:
            raise ValueError(
                "If the 'save' option is specified, 'output_file' must also be!"
            )
        open(output_file, "w")
        output_handle = open(output_file, "a")
        output_handle.write(
            "#track viewLimits=-{}:{} graphType=heatmap color=255,0,0\n".format(
                *view_limits
            )
        )
        igv.to_csv(output_handle, sep="\t", index=False)
        output_handle.close()

    return igv