Source code for ngs_toolkit.general

#!/usr/bin/env python


import os

import numpy as np
import pandas as pd

from ngs_toolkit import _CONFIG, _LOGGER
from ngs_toolkit.utils import get_this_file_or_timestamped


def get_genome_reference(
    organism,
    genome_assembly=None,
    output_dir=None,
    genome_provider="UCSC",
    file_format="2bit",
    dry_run=False,
    overwrite=True,
):
    """
    Get genome FASTA/2bit file.
    Saves results to disk and returns path to file.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Assembly to use.
        Defaults to the default assembly for the organism.
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to current directory.
    genome_provider : :obj:`str`, optional
        Which genome provider to use. One of 'UCSC' or 'Ensembl'.
    file_format : :obj:`str`, optional
        File format to get. One of 'fasta' or '2bit'.
    dry_run : :obj:`bool`, optional
        Whether to not download and just return path to file.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    {str, tuple}
        Path to genome FASTA/2bit file,
        but if ``dry_run``, a tuple of the URL of the reference genome and the path to the file.

    Raises
    ------
    ValueError
        If arguments are not in possible options or if desired combination
        is not available.
    """
    import pybedtools
    import subprocess
    import distutils.spawn

    from ngs_toolkit.utils import download_gzip_file, download_file

    def index_fasta(fasta):
        # The idea is to use a hidden method of bedtools
        # to create an index (to skip having e.g. samtools as a dependency)
        # and use the bedtools `nuc` command to do it.
        # This actually fails to get nucleotide content every time due to this 'bug':
        # https://github.com/daler/pybedtools/issues/147
        # but nonetheless creates an index.
        bed = pd.DataFrame([["chr1", 1, 2]])
        try:
            bed = pybedtools.BedTool().from_dataframe(bed)
            bed.nucleotide_content(fi=fasta)
        except pybedtools.helpers.BEDToolsError:
            pass

    def twobit_to_fasta(genome_file):
        if distutils.spawn.find_executable("twoBitToFa") is not None:
            args = ["twoBitToFa"] + [genome_file, genome_file.replace(".2bit", ".fa")]
            subprocess.check_output(args, shell=False)
            index_fasta(genome_file.replace(".2bit", ".fa"))

    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    opts = ["UCSC", "Ensembl"]
    if genome_provider not in opts:
        msg = "`genome_provider` attribute must be one of '{}'.".format(", ".join(opts))
        _LOGGER.error(msg)
        raise ValueError(msg)

    opts = ["fasta", "2bit"]
    if file_format not in opts:
        msg = "`file_format` attribute must be one of '{}'.".format(", ".join(opts))
        _LOGGER.error(msg)
        raise ValueError(msg)

    if (genome_provider == "Ensembl") and (file_format == "2bit"):
        msg = "Ensembl does not provide 2bit files."
        hint = " Use for example 'faToTwoBit' to convert the FASTA file."
        _LOGGER.error(msg + hint)
        raise ValueError(msg)

    if genome_provider == "UCSC":
        default_organisms = {
            "human": "hg38",
            "hsapiens": "hg38",
            "homo_sapiens": "hg38",
            "mouse": "mm10",
            "mmusculus": "mm10",
            "mus_musculus": "mm10",
            "yeast": "sacCer3",
            "scerevisiae": "sacCer3",
            "saccharomyces_cerevisiae": "sacCer3",
        }
        base_link = (
            "http://hgdownload.cse.ucsc.edu/goldenPath/{assembly}/bigZips/{assembly}"
        )
        base_link += ".fa.gz" if file_format == "fasta" else ".2bit"
        if genome_assembly is None:
            genome_assembly = default_organisms[organism]
        url = base_link.format(assembly=genome_assembly)

    elif genome_provider == "Ensembl":
        default_organisms = {
            "human": {
                "long_species": "homo_sapiens",
                "version": "grch38",
                "release": "75",
            },
            "hsapiens": {
                "long_species": "homo_sapiens",
                "version": "grch38",
                "release": "75",
            },
            "homo_sapiens": {
                "long_species": "homo_sapiens",
                "version": "grch38",
                "release": "75",
            },
            "mouse": {
                "long_species": "mus_musculus",
                "version": "grcm38",
                "release": "94",
            },
            "mmusculus": {
                "long_species": "mus_musculus",
                "version": "grcm38",
                "release": "94",
            },
            "mus_musculus": {
                "long_species": "mus_musculus",
                "version": "grcm38",
                "release": "94",
            },
            "yeast": {
                "long_species": "saccharomyces_cerevisiae",
                "version": "R64",
                "release": "94",
            },
            "scerevisiae": {
                "long_species": "saccharomyces_cerevisiae",
                "version": "R64",
                "release": "94",
            },
            "saccharomyces_cerevisiae": {
                "long_species": "saccharomyces_cerevisiae",
                "version": "R64",
                "release": "94",
            },
        }
        if genome_assembly is None:
            genome_assembly = default_organisms[organism]["version"].replace(
                "grc", "GRC"
            )
        base_link = (
            "ftp://ftp.ensembl.org/pub/release-{release}/fasta/{long_organism}/dna/"
        )
        base_link += "{Clong_organism}.{assembly}."
        base_link += (
            "{}.".format(default_organisms[organism]["release"])
            if genome_assembly.endswith("37")
            else ""
        )
        base_link += "{sequence_type}.{id_type}.{id}fa.gz".format(
            sequence_type="dna", id_type="primary_assembly", id=""
        )
        url = base_link.format(
            release=default_organisms[organism]["release"],
            long_organism=default_organisms[organism]["long_species"],
            Clong_organism=default_organisms[organism]["long_species"].capitalize(),
            assembly=genome_assembly,
        )

    if (
        (genome_provider == "UCSC")
        and (file_format == "fasta")
        and (genome_assembly != "hg38")
    ):
        msg = "UCSC does not provide FASTA files for the {} assembly.".format(
            genome_assembly
        )
        hint = " Download a 2bit file and use for example 'twoBitToFa' to convert."
        _LOGGER.error(msg + hint)
        raise ValueError(msg)

    genome_file = os.path.join(
        output_dir,
        "{}.{}.{}".format(
            organism, genome_assembly, "fa.gz" if file_format == "fasta" else "2bit"
        ),
    )
    if os.path.exists(genome_file) and (not overwrite):
        msg = "Genome file already exists and 'overwrite' is set to False."
        hint = " Returning existing file: {}".format(genome_file)
        _LOGGER.warning(msg + hint)
        # even so, if 2bit and FASTA not there, try to get FASTA
        if file_format == "2bit":
            if not os.path.exists(genome_file.replace(".2bit", ".fa")):
                twobit_to_fasta(genome_file)
        return genome_file

    # create .fai index for FASTA file
    if file_format == "fasta":
        if not dry_run:
            download_gzip_file(url, genome_file)
            index_fasta(genome_file)
            return genome_file
    else:
        if not dry_run:
            download_file(url, genome_file)
            twobit_to_fasta(genome_file)
            return genome_file

    return (url, genome_file)

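# Usage sketch (illustrative, not part of the module; the output directory
# is an assumption): download the default hg38 2bit file from UCSC, or just
# resolve the URL and target path with `dry_run`.
#
#   genome_file = get_genome_reference("human", output_dir="reference")
#   url, path = get_genome_reference(
#       "human", genome_provider="UCSC", file_format="fasta", dry_run=True)
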
def get_blacklist_annotations(
    organism, genome_assembly=None, output_dir=None, overwrite=True
):
    """
    Get annotations of blacklisted genomic regions for a given organism/genome assembly.
    Saves results to disk and returns a path to a BED file.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "hg38/grch38" and for "mouse" is "mm10/grcm38".
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    str
        Path to blacklist BED file
    """
    from ngs_toolkit.utils import download_gzip_file

    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    default_organisms = {"human": "hg38", "mouse": "mm10"}
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]
        _LOGGER.warning(
            "Genome assembly not selected. Using assembly '{}' for '{}'.".format(
                genome_assembly, organism
            )
        )

    output_file = os.path.join(
        output_dir, "{}.{}.blacklist.bed".format(organism, genome_assembly)
    )
    if os.path.exists(output_file) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation file: {}".format(output_file)
        _LOGGER.warning(msg + hint)
        return output_file

    url = "http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/"
    if genome_assembly not in ["hg19"]:
        url += "{0}-{1}/{0}.blacklist.bed.gz".format(genome_assembly, organism)
    else:
        url += "{0}-human/wgEncodeHg19ConsensusSignalArtifactRegions.bed.gz".format(
            genome_assembly
        )
    try:
        download_gzip_file(url, output_file)
    except OSError:
        msg = "Could not download file: {}".format(url)
        _LOGGER.error(msg)
        raise OSError(msg)
    return output_file

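# Usage sketch (illustrative; the output directory is an assumption):
# fetch the blacklist BED file for mouse mm10.
#
#   blacklist = get_blacklist_annotations(
#       "mouse", genome_assembly="mm10", output_dir="reference")
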
def get_tss_annotations(
    organism,
    genome_assembly=None,
    save=True,
    output_dir=None,
    chr_prefix=True,
    gene_types=["protein_coding", "processed_transcript", "lincRNA", "antisense"],
    overwrite=True,
):
    """
    Get annotations of TSS for a given organism/genome assembly.
    This is a simple approach using Biomart's API querying the Ensembl database.
    Saves results to disk and returns a dataframe.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "grch38" and for "mouse" is "grcm38".
    save : :obj:`bool`, optional
        Whether to save to disk under ``output_dir``.
        Defaults to :obj:`True`.
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    chr_prefix : :obj:`bool`, optional
        Whether chromosome names should have the "chr" prefix.
        Defaults to :obj:`True`.
    gene_types : :obj:`list`, optional
        Subset of transcript biotypes to keep.
        See the available biotypes here: https://www.ensembl.org/Help/Faq?id=468
        Defaults to 'protein_coding', 'processed_transcript', 'lincRNA', 'antisense'.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        DataFrame with genome annotations.
    """
    default_organisms = {
        "human": {"species": "hsapiens", "ensembl_version": "grch38"},
        "mouse": {"species": "mmusculus", "ensembl_version": "grcm38"},
        "yeast": {"species": "scerevisiae", "ensembl_version": "R64"},
    }
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]["ensembl_version"]
    if genome_assembly == "hg19":
        genome_assembly = "grch37"
    if genome_assembly == "hg38":
        genome_assembly = "grch38"
    if genome_assembly == "mm10":
        genome_assembly = "grcm38"
    if genome_assembly == "sacCer3":
        genome_assembly = "R64"

    # set the output directory default before building the output path,
    # so the overwrite check below does not fail when `output_dir` is None
    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")

    output_file = os.path.join(
        output_dir, "{}.{}.gene_annotation.tss.bed".format(organism, genome_assembly)
    )
    o = get_this_file_or_timestamped(output_file)
    if os.path.exists(o) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation from file: {}".format(o)
        _LOGGER.warning(msg + hint)
        annot = pd.read_csv(o, header=None, sep="\t")
        return annot

    if save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    attributes = [
        "chromosome_name",
        "start_position",
        "ensembl_gene_id",
        "external_gene_name",
        "strand",
        "transcript_biotype",
    ]
    res = query_biomart(
        attributes=attributes,
        species=default_organisms[organism]["species"],
        ensembl_version=genome_assembly,
    )

    if gene_types is None:
        res = res.drop(["transcript_biotype"], axis=1).drop_duplicates()
    else:
        res = res.loc[res["transcript_biotype"].isin(gene_types), :]

    if chr_prefix:
        res.loc[:, "chromosome_name"] = "chr" + res["chromosome_name"]

    res.loc[:, "start_position"] = res["start_position"].astype(int)
    res.loc[:, "strand"] = res["strand"].replace("1", "+").replace("-1", "-")
    res.loc[:, "end"] = res.apply(
        lambda x: x["start_position"] + 1
        if x["strand"] == "+"
        else x["start_position"],
        axis=1,
    )
    res.loc[:, "start_position"] = res.apply(
        lambda x: x["start_position"] - 1
        if x["strand"] == "-"
        else x["start_position"],
        axis=1,
    )

    # drop same-gene duplicates if starting in the same position but just
    # annotated with different biotypes
    res = (
        res[attributes[:2] + ["end"] + attributes[2:]]
        .sort_values(by=res.columns.tolist(), axis=0)
        .drop_duplicates(subset=res.columns[:3], keep="last")
    )

    # make real BED format
    res.loc[:, "fill"] = "."
    cols = [
        "chromosome_name",
        "start_position",
        "end",
        "external_gene_name",
        "fill",
        "strand",
        "ensembl_gene_id",
        "transcript_biotype",
    ]
    res = res.loc[:, cols]
    res.columns = [
        "chr",
        "start",
        "end",
        "gene_name",
        "score",
        "strand",
        "ensembl_gene_id",
        "transcript_biotype",
    ]

    # save
    if save:
        res.to_csv(output_file, index=False, header=False, sep="\t")
        output_file = os.path.join(
            output_dir,
            "{}.{}.gene_annotation.protein_coding.tss.bed".format(
                organism, genome_assembly
            ),
        )
        res[res["transcript_biotype"] == "protein_coding"].drop(
            attributes[-1], axis=1
        ).to_csv(output_file, index=False, header=False, sep="\t")
    return res

def get_genomic_context(
    organism,
    genome_assembly=None,
    save=True,
    output_dir=None,
    chr_prefix=True,
    region_subset=[
        "promoter",
        "exon",
        "5utr",
        "3utr",
        "intron",
        "genebody",
        "intergenic",
    ],
    gene_types=["protein_coding", "processed_transcript", "lincRNA", "antisense"],
    promoter_width=3000,
    overwrite=True,
):
    """
    Get annotations of genomic context (promoter, exon, intron, etc)
    for a given organism/genome assembly.
    This is a simple approach using Biomart's API querying the Ensembl database.
    Saves results to disk and returns a dataframe.

    The API call to BioMart can take a bit, so the function
    should take ~4 min for a human genome.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "grch38" and for "mouse" is "grcm38".
    save : :obj:`bool`, optional
        Whether to save to disk under ``output_dir``.
        Defaults to :obj:`True`.
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    chr_prefix : :obj:`bool`, optional
        Whether chromosome names should have the "chr" prefix.
        Defaults to :obj:`True`.
    region_subset : :obj:`list`, optional
        Genomic region types to annotate.
        Defaults to all of 'promoter', 'exon', '5utr', '3utr', 'intron',
        'genebody', 'intergenic'.
    gene_types : :obj:`list`, optional
        Subset of transcript biotypes to keep.
        See the available biotypes here: https://www.ensembl.org/Help/Faq?id=468
        Defaults to 'protein_coding', 'processed_transcript', 'lincRNA', 'antisense'.
    promoter_width : :obj:`int`, optional
        Width of promoter regions in basepairs, centered on the gene start.
        Defaults to 3000.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        DataFrame with genome annotations.
    """
    from pybedtools import BedTool
    import pybedtools

    default_organisms = {
        "human": {
            "species": "hsapiens",
            "ensembl_version": "grch38",
            "ucsc_version": "hg38",
        },
        "mouse": {
            "species": "mmusculus",
            "ensembl_version": "grcm38",
            "ucsc_version": "mm10",
        },
        "yeast": {"species": "scerevisiae", "ensembl_version": "R64"},
    }
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]["ucsc_version"]
        ensembl_genome_assembly = default_organisms[organism]["ensembl_version"]
    elif genome_assembly == "hg19":
        ensembl_genome_assembly = "grch37"
    elif genome_assembly == "hg38":
        ensembl_genome_assembly = "grch38"
    elif genome_assembly == "mm10":
        ensembl_genome_assembly = "grcm38"
    elif genome_assembly == "sacCer3":
        ensembl_genome_assembly = "R64"
    else:
        _LOGGER.warning(
            "Genome assembly '{}' not recognized. "
            "Using it as the Ensembl version.".format(genome_assembly)
        )
        ensembl_genome_assembly = genome_assembly

    # set the output directory default before building the output path,
    # so the overwrite check below does not fail when `output_dir` is None
    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")

    output_file = os.path.join(
        output_dir,
        "{}.{}.genomic_context.bed".format(organism, ensembl_genome_assembly),
    )
    o = get_this_file_or_timestamped(output_file)
    if os.path.exists(o) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation from file: {}".format(o)
        _LOGGER.warning(msg + hint)
        annot = pd.read_csv(o, header=None, sep="\t")
        return annot

    if save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    attrs = [
        "ensembl_gene_id",
        "chromosome_name",
        "start_position",
        "end_position",
        "exon_chrom_start",
        "exon_chrom_end",
        "5_utr_start",
        "5_utr_end",
        "3_utr_start",
        "3_utr_end",
        "strand",
        "external_gene_name",
        "gene_biotype",
    ]
    res = query_biomart(
        attributes=attrs,
        species=default_organisms[organism]["species"],
        ensembl_version=ensembl_genome_assembly,
    )
    if chr_prefix:
        res["chromosome_name"] = "chr" + res["chromosome_name"]

    # Keep only desired gene types
    res = res.loc[res["gene_biotype"].isin(gene_types), :]
    # Get strand
    res.loc[:, "strand"] = res["strand"].replace("1", "+").replace("-1", "-")

    # remove exons which are also UTRs
    bed_cols = ["chromosome_name", "start_position", "end_position", "strand"]

    # Promoters = genebody:start +/- N
    promoter = res[bed_cols].drop_duplicates()
    for col in promoter.columns:
        if ("start" in col) or ("end" in col):
            promoter.loc[:, col] = promoter.loc[:, col].astype(int)
    promoter.loc[promoter["strand"] == "+", "start_position"] = promoter.loc[
        promoter["strand"] == "+", "start_position"
    ] - (promoter_width / 2)
    promoter.loc[promoter["strand"] == "-", "start_position"] = promoter.loc[
        promoter["strand"] == "-", "end_position"
    ] - (
        promoter_width / 2
    )  # + 1
    promoter.loc[promoter["strand"] == "+", "end_position"] = promoter.loc[
        :, "start_position"
    ] + (promoter_width / 2)
    for col in bed_cols[1:3]:
        promoter.loc[:, col] = promoter.loc[:, col].astype(int)
    # fix where promoter start < 0
    promoter.loc[promoter["start_position"] < 0, "start_position"] = 0
    # remove end < start
    promoter = promoter.loc[~(promoter["start_position"] > promoter["end_position"])]

    # genebody = start->end + promoter
    gb = res[bed_cols].drop_duplicates()
    for col in gb.columns:
        if ("start" in col) or ("end" in col):
            gb.loc[:, col] = gb.loc[:, col].astype(int)
    gb = gb.append(promoter)
    # for col in bed_cols[1:3]:
    #     gb.loc[:, col] = gb.loc[:, col].astype(int)
    gb = gb.sort_values(gb.columns.tolist())
    genebody_bed = BedTool.from_dataframe(gb)
    genebody_bed = genebody_bed.sort().merge()
    genebody = genebody_bed.to_dataframe()

    # Exon
    exon = (
        res[["chromosome_name", "exon_chrom_start", "exon_chrom_end"]]
        .drop_duplicates()
        .dropna()
    )
    # 5utr
    utr5 = (
        res[["chromosome_name", "5_utr_start", "5_utr_end"]].drop_duplicates().dropna()
    )
    # 3utr
    utr3 = (
        res[["chromosome_name", "3_utr_start", "3_utr_end"]].drop_duplicates().dropna()
    )
    for d in [exon, utr5, utr3]:
        for col in d.columns:
            if ("start" in col) or ("end" in col):
                d.loc[:, col] = d.loc[:, col].astype(int)

    # Introns = genebody - (promoter + exon + 5utr + 3utr)
    promoter = promoter.drop(["strand"], axis=1)
    exon.columns = utr3.columns = utr5.columns = promoter.columns = bed_cols[:3]
    others = promoter.append(exon).append(utr5).append(utr3)
    for col in others.columns[1:]:
        others.loc[:, col] = others.loc[:, col].astype(int)
    intron = genebody_bed.subtract(BedTool.from_dataframe(others))
    intron = intron.to_dataframe()

    # Intergenic = ChromSizes - genebody
    cs = pybedtools.get_chromsizes_from_ucsc(genome_assembly)
    cs = pd.DataFrame(cs).T.reset_index()
    if not chr_prefix:
        cs["index"] = cs["index"].str.replace("chr", "")
    cs = BedTool.from_dataframe(cs)
    intergenic = cs.subtract(genebody_bed)
    intergenic = intergenic.to_dataframe()

    # join all
    features = ["promoter", "genebody", "exon", "intron", "utr5", "utr3", "intergenic"]
    annot = list()
    for name in features:
        cur = locals()[name]
        cur = cur.iloc[:, list(range(3))]
        cur.columns = bed_cols[:3]
        cur.loc[:, "region_type"] = name
        if save:
            cur.sort_values(cur.columns.tolist()).drop("region_type", axis=1).to_csv(
                os.path.join(
                    output_dir,
                    "{}.{}.genomic_context.{}.bed".format(
                        organism, ensembl_genome_assembly, name
                    ),
                ),
                index=False,
                header=False,
                sep="\t",
            )
        annot.append(cur)

    annot = pd.concat(annot)
    annot = annot.sort_values(annot.columns.tolist())

    # save
    if save:
        annot.to_csv(output_file, index=False, header=False, sep="\t")
    return annot

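# Usage sketch (illustrative; the output directory is an assumption):
# build the genomic context annotation for human hg38. The BioMart
# query can take a few minutes.
#
#   context = get_genomic_context(
#       "human", genome_assembly="hg38", output_dir="reference")
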
def get_chromosome_sizes(
    organism, genome_assembly=None, output_dir=None, overwrite=True
):
    """
    Get a file with the sizes of chromosomes in a given organism/genome assembly.
    Saves results to disk and returns a path to a text file.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get chromosome sizes for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "hg38/grch38" and for "mouse" is "mm10/grcm38".
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    str
        Path to text file with chromosome sizes.
    """
    import pybedtools

    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    default_organisms = {"human": "hg38", "mouse": "mm10"}
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]
        _LOGGER.warning(
            "Genome assembly not selected. Using assembly '{}' for '{}'.".format(
                genome_assembly, organism
            )
        )

    output_file = os.path.join(
        output_dir, "{}.{}.chromosome_sizes.txt".format(organism, genome_assembly)
    )
    if os.path.exists(output_file) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation file: {}".format(output_file)
        _LOGGER.warning(msg + hint)
        return output_file

    sizes = pybedtools.get_chromsizes_from_ucsc(genome_assembly)
    with open(output_file, "w") as handle:
        for chrom, (_, size) in sizes.items():
            handle.write("{}\t{}\n".format(chrom, size))
    return output_file

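# Usage sketch (illustrative; the output directory is an assumption):
#
#   chrom_sizes = get_chromosome_sizes(
#       "human", genome_assembly="hg38", output_dir="reference")
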
def deseq_analysis(
    count_matrix,
    experiment_matrix,
    comparison_table,
    formula,
    output_dir,
    output_prefix,
    overwrite=True,
    alpha=0.05,
    independent_filtering=False,
    create_subdirectories=True,
    save_inputs=True,
    **kwargs
):
    """
    Perform differential comparison analysis with DESeq2.

    .. note::
        Do not include hyphens ("-") in any of the samples or groups names!
        R freaks out with this.

    # TODO: fix hyphens in names issue

    Parameters
    ----------
    count_matrix : :obj:`pandas.DataFrame`
        Data frame of shape (samples, variables) with raw read counts.
    experiment_matrix : :obj:`pandas.DataFrame`
        Data frame with columns "sample_name" and any other variables used in the `formula`.
    comparison_table : :obj:`pandas.DataFrame`
        Data frame with columns "comparison_name", "sample_group" and "sample_name".
    formula : :obj:`str`
        Formula to test in R/patsy notation. Usually something like "~ batch + group".
    output_dir : :obj:`str`
        Output directory for produced files.
    output_prefix : :obj:`str`
        Prefix to add to produced files.
    overwrite : :obj:`bool`, optional
        Whether files existing should be overwritten.
        Defaults to :obj:`True`.
    alpha : number, optional
        Significance level to reject null hypothesis.
        This in practice has no effect as results for all features will be returned.
        Defaults to 0.05.
    independent_filtering : :obj:`bool`, optional
        Whether to use independent filtering in DESeq2's `results` call.
        Defaults to :obj:`False`.
    create_subdirectories : :obj:`bool`
        Whether to create subdirectories for the result of each comparison.
    save_inputs : :obj:`bool`, optional
        Whether to write the input matrices to disk.
        Defaults to :obj:`True`.
    **kwargs : :obj:`dict`
        Additional keyword arguments to be passed to the DESeq function of DESeq2.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Data frame with results, statistics for each feature.
    """
    from tqdm import tqdm
    from ngs_toolkit.utils import r2pandas_df, recarray2pandas_df

    from rpy2.robjects import numpy2ri, pandas2ri, r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()
    pandas2ri.activate()

    importr("DESeq2")

    # order experiment and count matrices in same way
    experiment_matrix = experiment_matrix.set_index("sample_name").loc[
        count_matrix.columns, :
    ]

    # save the matrices just in case
    if save_inputs:
        count_matrix.to_csv(
            os.path.join(output_dir, output_prefix + ".count_matrix.tsv"), sep="\t"
        )
        experiment_matrix.to_csv(
            os.path.join(output_dir, output_prefix + ".experiment_matrix.tsv"), sep="\t"
        )
        comparison_table.to_csv(
            os.path.join(output_dir, output_prefix + ".comparison_table.tsv"), sep="\t"
        )

    # Rename samples to avoid R errors with sample names containing symbols
    count_matrix.columns = ["S{}".format(i) for i in range(len(count_matrix.columns))]
    experiment_matrix.index = [
        "S{}".format(i) for i in range(len(experiment_matrix.index))
    ]
    # Replace hyphens with underscores
    experiment_matrix = experiment_matrix.replace("-", "_")
    comparison_table = comparison_table.replace("-", "_")

    # Run DESeq analysis
    dds = r.DESeqDataSetFromMatrix(
        countData=count_matrix.astype(int),
        colData=experiment_matrix,
        design=r("as.formula")(formula),
    )
    dds = r.DESeq(dds, parallel=True, **kwargs)
    # _save(dds, file=os.path.join(output_dir, output_prefix + ".deseq_dds_object.Rdata"))

    results = list()
    comps = comparison_table["comparison_name"].drop_duplicates().sort_values()
    for comp in tqdm(comps, total=len(comps), desc="Comparison"):
        if create_subdirectories:
            if not os.path.exists(os.path.join(output_dir, comp)):
                os.makedirs(os.path.join(output_dir, comp))
            out_file = os.path.join(
                output_dir, comp, output_prefix + ".deseq_result.{}.csv".format(comp)
            )
        else:
            out_file = os.path.join(
                output_dir, output_prefix + ".deseq_result.{}.csv".format(comp)
            )

        if not overwrite and os.path.exists(get_this_file_or_timestamped(out_file)):
            continue
        _LOGGER.info("Doing comparison '{}'".format(comp))
        a = (
            comparison_table.loc[
                (comparison_table["comparison_name"] == comp)
                & (comparison_table["comparison_side"] >= 1),
                "sample_group",
            ]
            .drop_duplicates()
            .squeeze()
        )
        b = (
            comparison_table.loc[
                (comparison_table["comparison_name"] == comp)
                & (comparison_table["comparison_side"] <= 0),
                "sample_group",
            ]
            .drop_duplicates()
            .squeeze()
        )

        contrast = np.array(["sample_group", a, b])
        try:
            res = r("as.data.frame")(
                r.results(
                    dds,
                    contrast=contrast,
                    alpha=alpha,
                    independentFiltering=independent_filtering,
                    parallel=True,
                )
            )
        except Exception as e:
            _LOGGER.warning("DESeq2 group contrast '{}' didn't work!".format(contrast))
            _LOGGER.error(e)
            contrast = ["sample_group" + a, "sample_group" + b]
            try:
                res = r("as.data.frame")(
                    r.results(
                        dds,
                        contrast=contrast,
                        alpha=alpha,
                        independentFiltering=independent_filtering,
                        parallel=True,
                    )
                )
                _LOGGER.warning(
                    "DESeq2 group contrast '{}' did work now!".format(
                        ", ".join(contrast)
                    )
                )
            except Exception as e2:
                _LOGGER.warning(
                    "DESeq2 group contrast '{}' didn't work either!".format(
                        ", ".join(contrast)
                    )
                )
                _LOGGER.error(e2)
                raise e2

        if isinstance(res, np.recarray):
            res = recarray2pandas_df(res)
            res.index = count_matrix.index

        if not isinstance(res, pd.DataFrame):
            # convert to pandas dataframe
            res = r2pandas_df(res)
        res.loc[:, "comparison_name"] = comp
        res.index.name = "index"

        # save
        res.sort_values("pvalue").to_csv(out_file)
        # append
        results.append(res)

    # save all
    results = pd.concat(results)
    results.index.name = "index"
    results.to_csv(
        os.path.join(output_dir, output_prefix + ".deseq_result.all_comparisons.csv"),
        index=True,
    )

    # return
    return results

def least_squares_fit(
    matrix,
    design_matrix,
    test_model,
    null_model="~ 1",
    standardize_data=True,
    multiple_correction_method="fdr_bh",
):
    """
    Fit a least squares model with only categorical predictors.
    Computes p-values by comparing the log likelihood ratio of the
    chosen model to a `null_model`.

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        A Data frame of shape (samples, variables).
    design_matrix : :obj:`pandas.DataFrame`
        A Data frame of shape (samples, variables) with all the variables in `test_model`.
    test_model : :obj:`str`
        Model design to test in R/patsy notation.
    null_model : :obj:`str`, optional
        Null model design in R/patsy notation.
        Defaults to "~ 1".
    standardize_data : :obj:`bool`, optional
        Whether data should be standardized prior to fitting.
        Defaults to :obj:`True`.
    multiple_correction_method : :obj:`str`, optional
        Method to use for multiple test correction.
        See statsmodels.sandbox.stats.multicomp.multipletests.
        Defaults to "fdr_bh".

    Returns
    -------
    :obj:`pandas.DataFrame`
        Statistics of model fitting and comparison between models for each feature.

    :Example:

        matrix = np.random.random(10000000).reshape(100, 100000)
        P = np.concatenate([[0] * 50, [1] * 50])  # dependent variable
        Q = np.concatenate([[0] * 25, [1] * 25] + [[0] * 25, [1] * 25])  # covariate
        design_matrix = pd.DataFrame([P, Q], index=["P", "Q"]).T
        matrix = matrix.T * ((1 + design_matrix.sum(axis=1)) * 4).values
        matrix = pd.DataFrame(matrix.T)
        test_model = "~ Q + P"
        null_model = "~ Q"
        res = least_squares_fit(matrix, design_matrix, test_model, null_model)
        res.head()
    """
    import patsy
    from scipy import stats
    from scipy.linalg import lstsq
    from sklearn.preprocessing import StandardScaler
    from statsmodels.sandbox.stats.multicomp import multipletests

    if standardize_data:
        norm = StandardScaler()
        matrix = pd.DataFrame(
            norm.fit_transform(matrix), index=matrix.index, columns=matrix.columns
        )

    a1 = patsy.dmatrix(test_model, design_matrix)
    betas1, residuals1, _, _ = lstsq(a1, matrix)

    a0 = patsy.dmatrix(null_model, design_matrix)
    betas0, residuals0, _, _ = lstsq(a0, matrix)

    results = pd.DataFrame(
        betas1.T, columns=a1.design_info.column_names, index=matrix.columns
    )

    # Calculate the log-likelihood ratios
    n = float(matrix.shape[0])
    results["model_residuals"] = residuals1
    results["null_residuals"] = residuals0
    results["model_log_likelihood"] = (
        (-n / 2.0) * np.log(2 * np.pi)
        - n / 2.0 * np.log(results["model_residuals"] / n)
        - n / 2.0
    )
    results["null_log_likelihood"] = (
        (-n / 2.0) * np.log(2 * np.pi)
        - n / 2.0 * np.log(results["null_residuals"] / n)
        - n / 2.0
    )
    results["log_likelihood_ratio"] = (
        results["model_log_likelihood"] - results["null_log_likelihood"]
    )
    results["D_statistic"] = 2 * results["log_likelihood_ratio"]
    results["p_value"] = stats.chi2.sf(
        results["log_likelihood_ratio"], df=betas1.shape[0] - betas0.shape[0]
    )
    results["q_value"] = multipletests(
        results["p_value"], method=multiple_correction_method
    )[1]

    if not standardize_data:
        results["mean"] = matrix.mean(axis=0)

    return results

def differential_from_bivariate_fit(
    comparison_table,
    matrix,
    output_dir,
    output_prefix,
    n_bins=250,
    multiple_correction_method="fdr_bh",
    plot=True,
    palette="colorblind",
    make_values_positive=False,
):
    """
    Perform differential analysis using a bivariate gaussian fit
    on the relationship between mean and fold-change for each comparison.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`
        Dataframe with 'comparison_name', 'comparison_side', 'sample_name'
        and 'sample_group' columns.
    matrix : :obj:`pandas.DataFrame`
        Matrix of `n_features, n_samples` with normalized, log-transformed
        values to perform analysis on.
    output_dir : :obj:`str`
        Output directory.
    output_prefix : :obj:`str`
        Prefix for outputs.
    n_bins : :obj:`int`
        Number of bins of mean values along which to standardize fold-changes.
    multiple_correction_method : :obj:`str`
        Multiple correction method from `statsmodels.sandbox.stats.multicomp.multipletests`.
    plot : :obj:`bool`
        Whether to generate plots.
    palette : :obj:`str`
        Color palette to use.
        This can be any matplotlib palette and is passed to `sns.color_palette`.
    make_values_positive : :obj:`bool`
        Whether to transform `matrix` to have minimum value 0.
        Default False.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Results of fitting and comparison between groups for each feature.
    """
    import matplotlib.pyplot as plt
    from ngs_toolkit.graphics import savefig
    from scipy.stats import gaussian_kde
    import seaborn as sns
    from statsmodels.sandbox.stats.multicomp import multipletests

    comparisons = comparison_table["comparison_name"].drop_duplicates().sort_values()
    if plot:
        fig, axis = plt.subplots(
            2,
            len(comparisons),
            figsize=(4 * len(comparisons), 4 * 2),
            sharex=True,
            sharey="row",
        )

    if make_values_positive:
        matrix = matrix + abs(matrix.min().min())

    results = pd.DataFrame()
    for i, comparison in enumerate(comparisons):
        _LOGGER.info("Doing comparison '{}'".format(comparison))
        out_file = os.path.join(
            output_dir, output_prefix + ".fit_result.{}.csv".format(comparison)
        )

        sa = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 1),
            "sample_name",
        ]
        ga = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 1),
            "sample_group",
        ].squeeze()
        sb = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 0),
            "sample_name",
        ]
        gb = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 0),
            "sample_group",
        ].squeeze()
        a = matrix.loc[:, sa].mean(axis=1)
        a.name = ga
        b = matrix.loc[:, sb].mean(axis=1)
        b.name = gb

        # assemble stats
        res = a.to_frame()
        res = res.join(b)
        res["global_mean"] = matrix.mean(axis=1)
        res["global_std"] = matrix.std(axis=1)
        res["comparison_mean"] = res.mean(axis=1)
        res["comparison_std"] = res.mean(axis=1)
        res["log2FoldChange"] = np.log2(res[ga] / res[gb])
        res["comparison_name"] = comparison

        # standardize fold change
        bounds = np.linspace(0, res["comparison_mean"].max(), n_bins)
        for (start, end) in zip(bounds[:-2], bounds[1:-1]):
            r = res.loc[
                (res["comparison_mean"] > start) & (res["comparison_mean"] < end)
            ].index
            v = res.loc[r, "log2FoldChange"]
            res.loc[r, "norm_log2FoldChange"] = (v - np.nanmean(v)) / np.nanstd(v)

        # let's try a bivariate gaussian kernel
        # separately for positive and negative to avoid biases in center of mass
        kernel = gaussian_kde(
            res.loc[
                res["norm_log2FoldChange"] > 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )
        res.loc[res["norm_log2FoldChange"] > 0, "density"] = kernel(
            res.loc[
                res["norm_log2FoldChange"] > 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )
        kernel = gaussian_kde(
            res.loc[
                res["norm_log2FoldChange"] <= 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )
        res.loc[res["norm_log2FoldChange"] <= 0, "density"] = kernel(
            res.loc[
                res["norm_log2FoldChange"] <= 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )

        # Let's calculate something like an empirical p-value on the density
        res["pvalue"] = (res["density"] - res["density"].min()) / (
            res["density"].max() - res["density"].min()
        )
        res["padj"] = multipletests(
            res["pvalue"].fillna(1), method=multiple_correction_method
        )[1]

        res["direction"] = (res["norm_log2FoldChange"] >= 0).astype(int).replace(0, -1)
        res.to_csv(out_file)

        if plot:
            axis[0, i].scatter(
                res["comparison_mean"],
                res["log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[0],
                rasterized=True,
            )
            axis[0, i].axhline(0, color="black", linestyle="--")
            axis[1, i].scatter(
                res["comparison_mean"],
                res["norm_log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[0],
                rasterized=True,
            )
            diff = res.loc[
                (res["pvalue"] < 0.05) & (res["norm_log2FoldChange"].abs() >= 2), :
            ].index
            axis[0, i].scatter(
                res.loc[diff, "comparison_mean"],
                res.loc[diff, "log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[1],
                rasterized=True,
            )
            axis[1, i].scatter(
                res.loc[diff, "comparison_mean"],
                res.loc[diff, "norm_log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[1],
                rasterized=True,
            )
            axis[1, i].axhline(0, color="black", linestyle="--")
            axis[0, i].set_title(comparison + "\n" + ga + " vs " + gb)
            axis[1, i].set_xlabel("Comparison mean")
            if i == 0:
                axis[0, i].set_ylabel("log2 fold-change")
                axis[1, i].set_ylabel("Norm(log2 fold-change)")

        results = results.append(res.reset_index(), ignore_index=True)

    # save figure (only if one was created)
    if plot:
        savefig(
            fig,
            os.path.join(
                output_dir, output_prefix + ".deseq_result.all_comparisons.scatter.svg"
            ),
        )

    # save all
    results = results.set_index("index")
    results.to_csv(
        os.path.join(output_dir, output_prefix + ".deseq_result.all_comparisons.csv"),
        index=True,
    )

    return results

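# Usage sketch (illustrative; `comparisons` and `norm_matrix` stand for a
# comparison table and a normalized, log-transformed matrix as described
# in the docstring above):
#
#   res = differential_from_bivariate_fit(
#       comparisons, norm_matrix, output_dir="bivariate",
#       output_prefix="example", plot=False)
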
# def independent_filtering(df, alpha=0.05, n_quantiles=100):
#     """
#     """
#     req_columns = ["pvalue", "baseMean"]
#     assert all([x in df.columns for x in req_columns])

#     # compute quantiles across mean and pvalue distributions
#     stats = pd.DataFrame()
#     p = (np.arange(n_quantiles) / float(n_quantiles)) * 100
#     p = np.append(p, 100.)
#     for start, end in zip(p, p[1:]):
#         m = np.log2(1 + df['baseMean'])
#         i = df.loc[
#             (m >= np.percentile(m, start)) &
#             (m <= np.percentile(m, end)), :].index
#         stats.loc[start, "n"] = i.shape[0]
#         stats.loc[start, "mean"] = df.loc[i, "baseMean"].mean()
#         stats.loc[start, "mean_p"] = df.loc[i, "pvalue"].mean()
#         stats.loc[start, "n_sig_p"] = (df.loc[i, "pvalue"] < alpha).sum()

#     # plot
#     fig, axis = plt.subplots(1, 2, figsize=(4 * 2, 4 * 1))
#     axis[0].scatter(stats.index, stats.loc[:, "n_sig_p"])
#     axis[1].scatter(stats.index, -np.log10(stats.loc[:, "mean_p"]))

#     # choose inflection point

#     return


# def fit_curve():
#     def func(x, a, b, c):
#         return a * np.exp(-b * x) + c

#     plt.plot(stats.index, -np.log10(stats['mean_p']), 'b-', label='data')

#     popt, pcov = curve_fit(func, stats.index, -np.log10(stats['mean_p']))
#     plt.plot(stats.index, func(stats.index, *popt), 'r-', label='fit')

#     popt, pcov = curve_fit(func, stats.index, stats['mean_p'], bounds=(0, [3., 2., 1.]))
#     plt.plot(stats.index, func(stats.index, *popt), 'g--', label='fit-with-bounds')
#     plt.xlabel('x')
#     plt.ylabel('y')
#     plt.legend()
#     plt.show()

def lola(bed_files, universe_file, output_folder, genome, output_prefixes=None, cpus=8):
    """
    Perform location overlap analysis (LOLA).

    If bed_files is a list with more than one element, use ``output_prefixes``
    to pass a list of prefixes to label the output files for each input BED file.

    Files will be created in ``output_folder`` mimicking the output that the
    R function LOLA::writeCombinedEnrichment writes.

    Requires the R package "LOLA" to be installed:

    .. highlight:: R
    .. code-block:: R

        if (!requireNamespace("BiocManager", quietly = TRUE))
            install.packages("BiocManager")
        BiocManager::install("LOLA")

    Parameters
    ----------
    bed_files : {str, list}
        A string path to a BED file or a list of paths.
    universe_file : :obj:`str`
        A path to a BED file representing the universe from where the BED file(s) come from.
    output_folder : :obj:`str`
        Output folder for resulting files.
    genome : :obj:`str`
        Genome assembly from which the BED files come from.
        This is used to get the LOLA databases from the :obj:`ngs_toolkit._CONFIG` parameters.
    output_prefixes : :obj:`list`, optional
        A list of strings with prefixes to be used in case ``bed_files`` is a list.
    cpus : :obj:`int`, optional
        Number of CPUs/threads to use.
        Defaults to 8.
    """
    from ngs_toolkit.utils import r2pandas_df
    from rpy2.robjects import numpy2ri, pandas2ri, r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()
    pandas2ri.activate()

    importr("LOLA")

    # Get region databases from config
    _LOGGER.info(
        "Getting LOLA databases for genome '%s' from configuration.", genome)

    msg = (
        "LOLA database values in configuration could not be found or understood. "
        "Please add a list of value(s) to this section "
        "'resources:lola:region_databases:%s'. "
        "For an example, see "
        "https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/config/example.yaml")
    try:
        databases = _CONFIG["resources"]["lola"]["region_databases"][genome]
    except KeyError:
        _LOGGER.error(msg, genome)
        raise

    if not isinstance(databases, list):
        if isinstance(databases, str):
            # wrap a single path in a list (list() on a string would split it
            # into characters)
            databases = [databases]
        else:
            _LOGGER.error(msg, genome)
            raise KeyError(msg % genome)
    if len(databases) < 1:
        _LOGGER.error(msg, genome)
        raise KeyError(msg % genome)

    if isinstance(bed_files, str):
        bed_files = [bed_files]
    if output_prefixes is None:
        if len(bed_files) > 1:
            msg = (
                "Running more than one BED file at once while only specifying "
                "`output_folder` argument will cause output files to be named "
                "in the form "
                "'{output_folder}/{region_database}.{input_file}.tsv'."
                " To prevent this behaviour, pass a list of arguments to "
                "`output_prefixes`.")
            _LOGGER.warning(msg)
            output_prefixes = [
                rr.replace(os.path.sep, "__").replace(".bed", ".") for rr in bed_files
            ]
        else:
            output_prefixes = ["."]

    _LOGGER.info("Reading up universe file '{}'.".format(universe_file))
    universe = r("LOLA::readBed")(universe_file)
    _LOGGER.info("Loading region set databases.")
    _regionDB = r("LOLA::loadRegionDB")(np.array(databases))

    for suffix, bed_file in zip(output_prefixes, bed_files):
        _LOGGER.info("Reading up BED file '{}'.".format(bed_file))
        user_set = r("LOLA::readBed")(bed_file)

        _LOGGER.info("Running LOLA testing for file '{}'.".format(bed_file))
        _lola_results = r("LOLA::runLOLA")(user_set, universe, _regionDB, cores=cpus)

        _LOGGER.info("Converting results from R to Python")
        lola_results = r2pandas_df(_lola_results)

        _LOGGER.info("Saving all results for file '{}'.".format(bed_file))
        lola_results.to_csv(
            os.path.join(output_folder, "allEnrichments" + suffix + "tsv"),
            index=False,
            sep="\t",
        )

        for region_set in lola_results["collection"].drop_duplicates():
            _LOGGER.info("Saving results for collection '%s' only.", region_set)
            lola_results[lola_results["collection"] == region_set].to_csv(
                os.path.join(output_folder, "col_" + region_set + suffix + "tsv"),
                index=False,
                sep="\t",
            )

def meme_ame(
    input_fasta,
    output_dir,
    background_fasta=None,
    organism="human",
    motif_database_file=None,
):
    import subprocess

    if motif_database_file is None:
        # Get motif database from config
        _LOGGER.info(
            "Getting motif database for organism '%s' from configuration.", organism)

        msg = (
            "Motif database value in configuration could not "
            "be found or understood. "
            "Please add a list of value(s) to this section "
            "'resources:meme:motif_databases:%s'. "
            "For an example, see "
            "https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/config/example.yaml")
        try:
            motif_database_file = _CONFIG["resources"]["meme"]["motif_databases"][
                organism
            ]
        except KeyError:
            _LOGGER.error(msg, organism)
            return
        if not isinstance(motif_database_file, str):
            _LOGGER.error(msg, organism)
            return

    # shuffle input if no background is provided
    if background_fasta is None:
        shuffled = input_fasta + ".shuffled"
        cmd = "fasta-dinucleotide-shuffle -c 1 -f {0} > {1}".format(
            input_fasta, shuffled
        )
        # the command contains a shell redirection, so it must run in a shell
        subprocess.call(cmd, shell=True)

    cmd = (
        "ame --bgformat 1 --scoring avg --method ranksum "
        "--pvalue-report-threshold 0.05 --control {0} -o {1} {2} {3}"
    ).format(
        background_fasta if background_fasta is not None else shuffled,
        output_dir,
        input_fasta,
        motif_database_file,
    )
    subprocess.call(cmd.split(" "))
    # subprocess.call("rm {}".format(shuffled).split(" "))


def homer_motifs(bed_file, output_dir, genome_assembly):
    import subprocess

    cmd = (
        "findMotifsGenome.pl {bed} {genome}r {out_dir} "
        "-size 1000 -h -p 2 -len 8,10,12,14 -noknown"
    ).format(bed=bed_file, genome=genome_assembly, out_dir=output_dir)
    subprocess.call(cmd.split(" "))

def homer_combine_motifs(
    comparison_dirs,
    output_dir,
    region_prefix="differential_analysis",
    reduce_threshold=0.6,
    match_threshold=10,
    info_value=0.6,
    p_value_threshold=1e-25,
    fold_enrichment=None,
    cpus=8,
    run=True,
    distributed=True,
    genome="hg38",
    motif_database=None,
):
    """
    Create consensus of de novo discovered motifs from HOMER.

    Parameters
    ----------
    comparison_dirs : :obj:`list`
        Iterable of comparison directories where homer was run.
        Should contain a "homerMotifs.all.motifs" file.
    output_dir : :obj:`str`
        Output directory.
    region_prefix : :obj:`str`, optional
        Prefix of the "<prefix>_regions.bed" files used when re-running enrichment.
        Defaults to "differential_analysis".
    reduce_threshold : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-reduceThresh`.
        Defaults to 0.6.
    match_threshold : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-matchThresh`.
        Defaults to 10.
    info_value : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-info`.
        Defaults to 0.6.
    p_value_threshold : number, optional
        Threshold for inclusion of a motif in the consensus set.
        Defaults to 1e-25.
    fold_enrichment : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-F`, if given.
    cpus : number, optional
        Number of available CPUS/threads for multithread processing.
        Defaults to 8.
    run : :obj:`bool`, optional
        Whether to run enrichment of each comparison in the consensus motifs.
        Default is True.
    distributed : :obj:`bool`, optional
        Whether to run enrichment as a cluster job.
        Default is True.
    genome : :obj:`str`
        Genome assembly of the data.
        Default is 'hg38'.
    motif_database : :obj:`str`
        Motif database to restrict motif matching to.

    Returns
    -------
    {str, None}
        If `run` is `False`, returns path to consensus motif file. Otherwise `None`.
    """
    from glob import glob
    import subprocess

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # concatenate files
    out_file = os.path.join(output_dir, "homerMotifs.combined.motifs")
    with open(out_file, "a") as outfile:
        for dir_ in comparison_dirs:
            with open(os.path.join(dir_, "homerMotifs.all.motifs"), "r") as infile:
                for line in infile:
                    outfile.write(line)

    # Filter and get motif consensus
    extra = ""
    if motif_database:
        extra = " -known {}".format(motif_database)
    if fold_enrichment is None:
        fold_enrichment = ""
    else:
        fold_enrichment = " -F " + str(fold_enrichment)
    subprocess.call(
        "compareMotifs.pl {} {} -reduceThresh {} -matchThresh {}{} -pvalue {} -info {}{} -nofacts -cpu {}".format(
            out_file,
            output_dir,
            reduce_threshold,
            match_threshold,
            extra,
            p_value_threshold,
            info_value,
            fold_enrichment,
            cpus,
        ).split(" ")
    )

    # concatenate consensus motif files
    files = glob(os.path.join(output_dir, "homerResults/*motif"))
    combined_motifs = os.path.join(output_dir, "homerMotifs.filtered.motifs")
    with open(combined_motifs, "w") as outfile:
        for f in files:
            if ("similar" in f) or ("RV" in f):
                continue
            _LOGGER.info(f)
            with open(f, "r") as infile:
                for line in infile:
                    outfile.write(line)

    # return the consensus motif file if not running enrichment,
    # as documented above
    if not run:
        return combined_motifs

    for dir_ in comparison_dirs:
        # delete previous results if existing
        results = os.path.join(dir_, "knownResults.txt")
        if os.path.exists(results):
            _LOGGER.warning(
                "Deleting previously existing results file: '{}'".format(results)
            )
            os.remove(results)
        # prepare enrichment command with consensus set
        cmd = "findMotifsGenome.pl {bed} {genome}r {dir} -p {cpus} -nomotif -mknown {motif_file}".format(
            bed=os.path.join(dir_, region_prefix + "_regions.bed"),
            genome=genome,
            cpus=cpus,
            dir=dir_,
            motif_file=combined_motifs,
        )
        # run
        if distributed:
            subprocess.call(
                "sbatch -J homer.{d} -o {dir}.homer.log -p shortq -c 8 --mem 20000".format(
                    d=os.path.basename(dir_), dir=dir_
                ).split(" ")
                + ["--wrap", cmd]
            )
        else:
            subprocess.call(cmd.split(" "))

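# Usage sketch (illustrative directory names): combine de novo motifs from
# two comparison directories into a consensus set without re-running the
# per-comparison enrichment.
#
#   consensus = homer_combine_motifs(
#       ["homer/comparisonA", "homer/comparisonB"], "homer/combined",
#       run=False, genome="hg38")
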
def enrichr(dataframe, gene_set_libraries=None, kind="genes", max_attempts=5):
    """
    Use Enrichr on a list of genes (currently only genes supported through the API).

    Parameters
    ----------
    dataframe : :obj:`pandas.DataFrame`
        DataFrame with column "gene_name".
    gene_set_libraries : :obj:`list`, optional
        Gene set libraries to use.
        Defaults to values in initial configuration file.
        To see them, do: ``ngs_toolkit._CONFIG['resources']['enrichr']['gene_set_libraries']``
    kind : :obj:`str`, optional
        Type of input. Right now, only "genes" is supported.
        Defaults to "genes".
    max_attempts : :obj:`int`, optional
        Number of times to try a call to Enrichr API.
        Defaults to 5.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Results of enrichment analysis.

    Raises
    ------
    Exception
        If `max_attempts` is exceeded.
    """
    import json

    import requests
    from tqdm import tqdm

    ENRICHR_ADD = "http://amp.pharm.mssm.edu/Enrichr/addList"
    ENRICHR_RETRIEVE = "http://amp.pharm.mssm.edu/Enrichr/enrich"
    query_string = "?userListId={}&backgroundType={}"

    if gene_set_libraries is None:
        # Get gene set libraries from config
        _LOGGER.debug("Getting Enrichr gene set libraries from configuration.")
        msg = "Enrichr gene set libraries value in configuration could not be found or understood. "
        msg += "Please add a list of value(s) to this section 'resources:enrichr:gene_set_libraries'. "
        msg += "For an example, see https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/config/example.yaml"
        try:
            gene_set_libraries = _CONFIG["resources"]["enrichr"]["gene_set_libraries"]
        except KeyError:
            _LOGGER.error(msg)
            return
        if not isinstance(gene_set_libraries, list):
            if isinstance(gene_set_libraries, str):
                # wrap a single library name in a list (list() on a string
                # would split it into characters)
                gene_set_libraries = [gene_set_libraries]
            else:
                _LOGGER.error(msg)
                return
        if len(gene_set_libraries) < 1:
            _LOGGER.error(msg)
            return

    if kind == "genes":
        # Build payload with gene list
        attr = "\n".join(dataframe["gene_name"].dropna().tolist())
    elif kind == "regions":
        # Build payload with BED-like regions
        attr = "\n".join(
            dataframe[["chrom", "start", "end"]]
            .apply(lambda x: "\t".join([str(i) for i in x]), axis=1)
            .tolist()
        )
    payload = {"list": (None, attr), "description": (None, "")}

    # Request adding gene set
    response = requests.post(ENRICHR_ADD, files=payload)
    if not response.ok:
        raise Exception("Error analyzing gene list")

    # Track gene set ID
    user_list_id = json.loads(response.text)["userListId"]

    results = pd.DataFrame()
    for gene_set_library in tqdm(
        gene_set_libraries, total=len(gene_set_libraries), desc="Gene set library"
    ):
        _LOGGER.debug("Using Enrichr on {} gene set library.".format(gene_set_library))

        # Request enriched sets in gene set
        i = 0
        okay = False
        while not okay:
            if i == max_attempts:
                _LOGGER.error(
                    "API request failed {} times, exceeding `max_attempts`.".format(i)
                )
                raise Exception("Fetching enrichment results maxed `max_attempts`.")
            response = requests.get(
                ENRICHR_RETRIEVE + query_string.format(user_list_id, gene_set_library)
            )
            okay = response.ok
            if not okay:
                _LOGGER.warning("API request failed. Retrying.")
                # count the failed attempt so `max_attempts` can be reached
                i += 1

        # Get enriched sets in gene set
        res = json.loads(response.text)
        # If there's no enrichment, continue
        if len(res) == 0:
            continue

        # Put in dataframe
        res = pd.DataFrame([pd.Series(s) for s in res[gene_set_library]])
        if res.shape[0] == 0:
            continue
        cols = [
            "rank",
            "description",
            "p_value",
            "z_score",
            "combined_score",
            "genes",
            "adjusted_p_value",
        ]
        if len(res.columns) == 7:
            res.columns = cols
        elif len(res.columns) == 9:
            res.columns = cols + ["old_p_value", "old_adjusted_p_value"]

        # Remember gene set library used
        res["gene_set_library"] = gene_set_library

        # Append to master dataframe
        results = results.append(res, ignore_index=True)

    return results

def run_enrichment_jobs(
    results_dir,
    genome,
    background_bed,
    steps=["lola", "meme", "homer", "enrichr"],
    overwrite=True,
    pep_config=None,
):
    """
    Submit parallel enrichment jobs for a specific analysis.

    Parameters
    ----------
    results_dir : :obj:`str`
        Directory with input files for the enrichment analyses.
    genome : :obj:`str`
        Genome assembly of the analysis.
    background_bed : :obj:`str`
        BED file to use as background for LOLA analysis.
        Typically the analysis' own consensus region set.
    steps : :obj:`list`, optional
        Steps of the analysis to perform.
        Defaults to ["lola", "meme", "homer", "enrichr"].
    overwrite : :obj:`bool`, optional
        Whether output should be overwritten.
        If not, no jobs will be submitted for jobs with existing output files.
        Defaults to True.
    pep_config : :obj:`str`, optional
        Pickle file of the analysis. Only required for the "region" step.
    """
    # TODO: replace hardcoded paths with info from resources
    # TODO: remove pep_config requirement to "region_enrichment"
    import sys
    from glob import glob

    from ngs_toolkit.utils import submit_job

    dbs = {
        "human": "~/resources/motifs/motif_databases/HUMAN/HOCOMOCOv10.meme",
        "mouse": "~/resources/motifs/motif_databases/MOUSE/uniprobe_mouse.meme",
    }
    omap = {"hg38": "human", "hg19": "human", "mm10": "mouse"}

    # list of tuples with: job_name, log, exec,
    # requirements (partition, cpu, mem, time), cmd
    jobs = list()

    # REGION
    if "region" in steps:
        files = glob(results_dir + "/*/*_regions.bed")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "region_type_enrichment.csv")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_region",
                    os.path.join(dir_, name + ".region.log"),
                    os.path.join(dir_, name + ".region.sh"),
                    ("shortq", 1, 8000, "08:00:00"),
                    "{} -m ngs_toolkit.recipes.region_enrichment --output-file {} {} {}".format(
                        sys.executable, output_, file, pep_config
                    ),
                ]
            )

    # LOLA
    if "lola" in steps:
        files = glob(results_dir + "/*/*_regions.bed")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "allEnrichments.tsv")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_lola",
                    os.path.join(dir_, name + ".lola.log"),
                    os.path.join(dir_, name + ".lola.sh"),
                    ("shortq", 2, 12000, "08:00:00"),
                    "{} -m ngs_toolkit.recipes.lola {} {} {} {} -c 2".format(
                        sys.executable, file, background_bed, dir_, genome
                    ),
                ]
            )

    # AME
    if "meme" in steps:
        files = glob(results_dir + "/*/*_regions.fa")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "ame.html")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_meme",
                    os.path.join(dir_, name + ".meme_ame.log"),
                    os.path.join(dir_, name + ".meme_ame.sh"),
                    ("shortq", 1, 4000, "08:00:00"),
                    "fasta-dinucleotide-shuffle -c 1 -f {f} > {f}.shuffled.fa\n".format(
                        f=file
                    )
                    + " ame --bgformat 1 --scoring avg --method ranksum --pvalue-report-threshold 0.05"
                    + " --control {f}.shuffled.fa -o {d} {f} {motifs}".format(
                        f=file, d=dir_, motifs=dbs[omap[genome]]
                    ),
                ]
            )

    # HOMER
    if "homer" in steps:
        files = glob(results_dir + "/*/*_regions.bed")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "homerResults.html")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_homer",
                    os.path.join(dir_, name + ".homer.log"),
                    os.path.join(dir_, name + ".homer.sh"),
                    ("shortq", 8, 12000, "08:00:00"),
                    "findMotifsGenome.pl {f} {genome}r {d} -size 1000 -h -p 2 -len 8,10,12,14 -noknown".format(
                        f=file, d=dir_, genome=genome
                    ),
                ]
            )

    # Enrichr
    if "enrichr" in steps:
        files = glob(results_dir + "/*/*.gene_symbols.txt")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = file.replace(".gene_symbols.txt", ".enrichr.csv")
            if os.path.exists(get_this_file_or_timestamped(output_)) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_enrichr",
                    os.path.join(dir_, name + ".enrichr.log"),
                    os.path.join(dir_, name + ".enrichr.sh"),
                    ("shortq", 1, 4000, "08:00:00"),
                    "{e} -m ngs_toolkit.recipes.enrichr {f} {o}".format(
                        e=sys.executable, f=file, o=output_
                    ),
                ]
            )

    for jobname, log_file, job_file, (partition, cores, mem, time), task in jobs:
        submit_job(
            task,
            job_file,
            log_file=log_file,
            jobname=jobname,
            partition=partition,
            cores=cores,
            mem=mem,
            time=time)

def project_to_geo(
    project,
    output_dir="geo_submission",
    samples=None,
    distributed=False,
    dry_run=False,
    **kwargs
):
    """
    Prepare raw sequencing files for submission to GEO.
    Files will be copied or generated in a new directory ``output_dir``.
    It will get the raw BAM file(s) of each sample, and in case of
    ATAC-seq/ChIP-seq samples, the bigWig and peak files. If multiple BAM
    files exist for each sample, all will be copied and sequentially named
    with the "fileN" suffix, where "N" is the file number.

    For each copied file a md5sum will be calculated.

    A pandas DataFrame with info on the sample's files and md5sums will be returned.

    Parameters
    ----------
    project : :obj:`peppy.Project`
        A :class:`peppy.Project` object to process.
    output_dir : :obj:`str`, optional
        Directory to create output. Will be created/overwritten if existing.
        Defaults to "geo_submission".
    samples : :obj:`list`, optional
        List of :class:`peppy.Sample` objects in project to restrict to.
        Defaults to all samples in project.
    distributed : :obj:`bool`, optional
        Whether processing should be distributed as jobs in a computing cluster
        for each sample.
        Currently available implementation supports a SLURM cluster only.
        Defaults is :obj:`False`.
    dry_run : :obj:`bool`, optional
        Whether copy/execution/submission commands should be not be run, to test.
        Default is :obj:`False`.
    **kwargs : :obj:`dict`
        Additional keyword arguments will be passed to
        :meth:`ngs_toolkit.utils.submit_job` if ``distributed`` is :obj:`True`,
        and on to a :class:`divvy` submission template.
        Pass for example:

        * computing_configuration="slurm"
        * jobname="job"
        * cores=2
        * mem=8000
        * partition="longq"

    Returns
    -------
    :class:`pandas.DataFrame`
        Annotation of samples and their BAM, BigWig, narrowPeak files and
        respective md5sums.
    """
    from ngs_toolkit.utils import submit_job

    output_dir = os.path.abspath(output_dir)
    if samples is None:
        samples = project.samples
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    annot = pd.DataFrame(index=pd.Index([], name="sample_name"))
    for sample in samples:
        various = len(sample.data_path.split(" ")) > 1
        cmd = ""
        for i, file in enumerate(sample.data_path.split(" ")):
            suffix = ".file{}".format(i) if various else ""
            # Copy raw file
            bam_file = os.path.join(output_dir, sample.name + "{}.bam".format(suffix))
            cmd += "cp {} {};\n".format(file, bam_file)
            cmd += "chmod 644 {};\n".format(bam_file)
            annot.loc[sample.name, "bam_file{}".format(i)] = bam_file

            # Copy or generate md5sum
            md5_file = bam_file + ".md5"
            if os.path.exists(file + ".md5"):
                cmd += "cp {} {};\n".format(file + ".md5", md5_file)
            else:
                b = os.path.basename(file)
                cmd += "md5sum {} > {};\n".format(os.path.join(output_dir, b), md5_file)
            cmd += "chmod 644 {};\n".format(md5_file)
            annot.loc[sample.name, "bam_file{}_md5sum".format(i)] = md5_file

        # Copy bigWig files
        if sample.library in ["ATAC-seq", "ChIP-seq"]:
            if hasattr(sample, "bigwig"):
                bigwig_file = os.path.join(output_dir, sample.name + ".bigWig")
                cmd += "cp {} {};\n".format(sample.bigwig, bigwig_file)
                cmd += "chmod 644 {};\n".format(bigwig_file)
                annot.loc[sample.name, "bigwig_file"] = bigwig_file

                # Copy or generate md5sum
                md5_file = bigwig_file + ".md5"
                if os.path.exists(sample.bigwig + ".md5"):
                    cmd += "cp {} {};\n".format(sample.bigwig + ".md5", md5_file)
                else:
                    b = os.path.basename(sample.bigwig)
                    cmd += "md5sum {} > {};\n".format(
                        os.path.join(output_dir, b), md5_file
                    )
                cmd += "chmod 644 {};\n".format(md5_file)
                annot.loc[sample.name, "bigwig_file_md5sum"] = md5_file
            else:
                _LOGGER.warning(
                    "'{}' sample '{}' does not have a 'bigwig'".format(
                        sample.library, sample.name
                    )
                    + " attribute set. Skipping bigWig file."
                )

        # Copy peaks
        if sample.library == "ATAC-seq":
            if hasattr(sample, "peaks"):
                peaks_file = os.path.join(output_dir, sample.name + ".peaks.narrowPeak")
                cmd += "cp {} {};\n".format(sample.peaks, peaks_file)
                cmd += "chmod 644 {};\n".format(peaks_file)
                annot.loc[sample.name, "peaks_file"] = peaks_file

                # Copy or generate md5sum
                md5_file = peaks_file + ".md5"
                if os.path.exists(sample.peaks + ".md5"):
                    cmd += "cp {} {};\n".format(sample.peaks + ".md5", md5_file)
                else:
                    cmd += "md5sum {} > {};\n".format(peaks_file, md5_file)
                cmd += "chmod 644 {};\n".format(md5_file)
                annot.loc[sample.name, "peaks_file_md5sum"] = md5_file
            else:
                _LOGGER.warning(
                    "'{}' sample '{}' does not have a 'peaks' attribute set.".format(
                        sample.library, sample.name
                    )
                    + " Skipping peaks file."
                )

        cmd += "\ndate\n"

        # Assemble job
        job_name = "project_to_geo.{}".format(sample.name)
        log_file = os.path.join(output_dir, job_name + ".log")
        job_file = os.path.join(output_dir, job_name + ".sh")
        submit_job(
            cmd,
            job_file,
            log_file=log_file,
            jobname=job_name,
            cores=1,
            mem=8000,
            dry_run=dry_run,
            **kwargs)

    return annot

[docs]def rename_sample_files(
    annotation_mapping,
    old_sample_name_column="old_sample_name",
    new_sample_name_column="new_sample_name",
    tmp_prefix="rename_sample_files",
    results_dir="results_pipeline",
    dry_run=False,
):
    """
    Rename existing directories with pipeline outputs for samples
    based on a mapping of old/new sample names.

    All files within the directory with the old sample name will be renamed recursively.
    Old and new sample names can overlap - this procedure will handle these cases correctly
    by a 2-step process using temporary sample names with prefix ``tmp_prefix``.

    .. caution::
        NEEDS TESTING!
        This function has not been used in a while and needs more testing.

    Parameters
    ----------
    annotation_mapping : :obj:`pandas.DataFrame`
        DataFrame with mapping of old (column ``old_sample_name_column``)
        vs new (column ``new_sample_name_column``) sample names.

    old_sample_name_column : :obj:`str`, optional
        Name of column with old sample names.

        Defaults to "old_sample_name".

    new_sample_name_column : :obj:`str`, optional
        Name of column with new sample names.

        Defaults to "new_sample_name".

    tmp_prefix : :obj:`str`, optional
        Prefix for temporary files to avoid overlap between old and new names.

        Defaults to "rename_sample_files".

    results_dir : :obj:`str`, optional
        Pipeline output directory containing sample output directories.

        Defaults to "results_pipeline".

    dry_run: :obj:`bool`, optional
        Whether to print commands instead of running them.

        Defaults to :obj:`False`.
    """
    import subprocess

    # TODO: test
    cmds = list()
    # 1) move to tmp name
    for i, series in annotation_mapping.iterrows():
        o = series[old_sample_name_column]
        t = "{}-{}".format(tmp_prefix, i)
        cmds.append("# Moving old sample '{}' to '{}'.".format(o, t))
        # directory
        cmds.append(
            "mv {} {}".format(
                os.path.join(results_dir, o), os.path.join(results_dir, t)
            )
        )
        # further directories
        cmds.append(
            "find {} -type d -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, t), o, t
            )
        )
        # files
        cmds.append(
            "find {} -type f -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, t), o, t
            )
        )

    # 2) move to new name
    for i, series in annotation_mapping.iterrows():
        t = "{}-{}".format(tmp_prefix, i)
        n = series[new_sample_name_column]
        cmds.append("# Moving temporary sample '{}' to '{}'.".format(t, n))
        # directory
        cmds.append(
            "mv {} {}".format(
                os.path.join(results_dir, t), os.path.join(results_dir, n)
            )
        )
        # further directories
        cmds.append(
            "find {} -type d -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, n), t, n
            )
        )
        # files
        cmds.append(
            "find {} -type f -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, n), t, n
            )
        )

    if dry_run:
        _LOGGER.info("\n".join(cmds))
    else:
        for i, cmd in enumerate(cmds):
            _LOGGER.info("%i: %s", i, cmd)
            if cmd.startswith("#"):
                continue
            r = subprocess.call(cmd.split(" "))
            if r != 0:
                raise OSError("Command '{}' failed.".format(cmd))
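# Hypothetical usage sketch (not part of the original module): note that the
# new name of the first sample overlaps with the old name of the second, which
# the two-step temporary renaming above handles safely.
def _example_rename_sample_files():
    mapping = pd.DataFrame(
        {
            "old_sample_name": ["sampleA", "sampleB"],
            "new_sample_name": ["sampleB", "sampleC"],
        }
    )
    # dry_run=True logs the shell commands instead of executing them
    rename_sample_files(mapping, results_dir="results_pipeline", dry_run=True)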
[docs]def query_biomart(
    attributes=None, species="hsapiens", ensembl_version="grch38", max_api_retries=5
):
    """
    Query Biomart (https://www.ensembl.org/biomart/martview/) for gene attributes.

    Returns a pandas dataframe with the query results.
    If a certain field contains commas, it will attempt to return a
    dataframe anyway, but parsing might fail.

    Parameters
    ----------
    attributes : :obj:`list`, optional
        List of Ensembl attributes to query.

        Defaults to ["ensembl_gene_id", "external_gene_name", "hgnc_id", "hgnc_symbol"].

    species : :obj:`str`, optional
        Ensembl string of species to query. Must be vertebrate.

        Defaults to "hsapiens".

    ensembl_version : :obj:`str`, optional
        Ensembl version to query. Currently "grch37", "grch38" and "grcm38" are tested.

        Defaults to "grch38".

    max_api_retries : :obj:`int`, optional
        How many times to retry a failed API request.

        Defaults to 5.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Dataframe with required attributes for each entry.
    """
    import time

    import requests
    import urllib3

    supported = ["grch37", "grch38", "grcm38"]
    if ensembl_version not in supported:
        msg = "Ensembl version might not be supported."
        msg += " Tested versions are '{}'.".format("','".join(supported))
        msg += " Will try anyway."
        _LOGGER.warning(msg)

    if attributes is None:
        attributes = ["ensembl_gene_id", "external_gene_name", "hgnc_id", "hgnc_symbol"]

    # Build request XML
    # "38"-series assemblies live on the main Ensembl host;
    # older ones are on dedicated subdomains (e.g. grch37.ensembl.org)
    ens_ver = "" if ensembl_version.endswith("38") else ensembl_version + "."
    url_query = "".join(
        [
            """http://{}ensembl.org/biomart/martservice?query=""".format(ens_ver),
            """<?xml version="1.0" encoding="UTF-8"?>""",
            """<!DOCTYPE Query>""",
            """<Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6" >""",
            """<Dataset name="{}_gene_ensembl" interface="default" >""".format(species),
        ]
        + ["""<Attribute name="{}" />""".format(attr) for attr in attributes]
        + ["""</Dataset>""", """</Query>"""]
    )
    n_fails = 0
    success = False
    while not success:
        req = requests.get(url_query, stream=True)
        if not req.ok:
            n_fails += 1
            msg = "Request to Biomart API was not successful."
            if n_fails == max_api_retries:
                _LOGGER.error(msg)
                raise ValueError(msg)
            else:
                _LOGGER.warning(msg + " Retrying.")
                time.sleep(1)
        else:
            try:
                content = list(req.iter_lines())
                success = True
            except (
                requests.exceptions.ChunkedEncodingError,
                urllib3.exceptions.ProtocolError,
            ):
                msg = "Retrieving Biomart request failed."
                n_fails += 1
                if n_fails == max_api_retries:
                    _LOGGER.error(msg)
                    raise ValueError(msg)
                else:
                    _LOGGER.warning(msg + " Retrying.")
                    time.sleep(1)

    # `iter_lines` yields bytes; decode before inspecting the content
    if isinstance(content[0], bytes):
        content = [x.decode("utf-8") for x in content]

    if (len(content) == 1) and (content[0].startswith("Query ERROR")):
        msg = "Request to Biomart API was not successful. Check your input.\n{}".format(
            content[0]
        )
        _LOGGER.error(msg)
        raise ValueError(msg)

    try:
        mapping = pd.DataFrame(
            [x.strip().split(",") for x in content], columns=attributes
        )
    except AssertionError as e:
        msg = "Could not simply return a dataframe with results."
        msg += " It is likely because one of the requested attributes contains commas and is quoted."
        msg += " Or because an organism not present in the vertebrate database was queried."
        msg += " Will try to replace these fields and parse again."
        _LOGGER.warning(msg)
        # input probably contains commas inside fields
        c = pd.Series([x.strip() for x in content])
        # we'll assume that fields with commas have been quoted;
        # get text inside double quotes
        cc = c.str.extract('"(.*)"')
        if cc.shape[1] > 1:
            _LOGGER.error("Attempt to fix failed.")
            raise e
        cc = cc.squeeze().str.replace(",", "")
        # now get text until first quote and concatenate with clean text
        f = c.str.extract('(.*),"').fillna("").squeeze() + "," + cc
        # now get back together with instances that didn't have quotes
        c.update(f)
        try:
            mapping = pd.DataFrame(
                [x.strip().split(",") for x in c], columns=attributes
            )
        except AssertionError:
            _LOGGER.error("Attempt to fix failed.")
            raise e
        _LOGGER.info("Attempt to fix successful.")

    return mapping.replace("", np.nan)
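# Hypothetical usage sketch (not part of the original module): map mouse
# Ensembl gene identifiers to gene names on the GRCm38 mart.
def _example_query_biomart():
    res = query_biomart(
        attributes=["ensembl_gene_id", "external_gene_name"],
        species="mmusculus",
        ensembl_version="grcm38",
    )
    print(res.head())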
[docs]def subtract_principal_component(
    x,
    pc=1,
    norm=False,
    plot=True,
    plot_name="PCA_based_batch_correction.svg",
    max_pcs_to_plot=6,
):
    """
    Given a matrix `x` (n_samples, n_variables), remove principal component
    `pc` (1-based) from the matrix, optionally plotting the samples in PCA
    space before and after removal.
    """
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    from ngs_toolkit.graphics import savefig

    pc -= 1

    # All regions
    if norm:
        # keep the DataFrame structure (index/columns) after scaling
        x = pd.DataFrame(
            StandardScaler().fit_transform(x), index=x.index, columns=x.columns
        )

    # PCA
    pca = PCA()
    x_hat = pca.fit_transform(x)

    # Remove PC
    x2 = x - np.outer(x_hat[:, pc], pca.components_[pc, :])

    max_pcs_to_plot = min(max_pcs_to_plot, x2.shape[0]) - 1
    # plot
    if plot:
        x2_hat = pca.fit_transform(x2)
        fig, axis = plt.subplots(
            max_pcs_to_plot, 2, figsize=(4 * 2, 4 * max_pcs_to_plot), squeeze=False
        )
        for i in range(max_pcs_to_plot):
            # before
            for j, _ in enumerate(x.index):
                axis[i, 0].scatter(x_hat[j, i], x_hat[j, i + 1], s=50, rasterized=True)
            axis[i, 0].set_xlabel("PC{}".format(i + 1))
            axis[i, 0].set_ylabel("PC{}".format(i + 2))
            # after
            for j, _ in enumerate(x2.index):
                axis[i, 1].scatter(
                    x2_hat[j, i], x2_hat[j, i + 1], s=35, alpha=0.8, rasterized=True
                )
            axis[i, 1].set_xlabel("PC{}".format(i + 1))
            axis[i, 1].set_ylabel("PC{}".format(i + 2))
        savefig(fig, plot_name)

    return x2
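# Hypothetical usage sketch (not part of the original module) with random data:
def _example_subtract_principal_component():
    x = pd.DataFrame(
        np.random.normal(size=(10, 100)),
        index=["sample{}".format(i) for i in range(10)],
    )
    # remove the first principal component, skipping the diagnostic plot
    x_fixed = subtract_principal_component(x, pc=1, plot=False)
    assert x_fixed.shape == x.shape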
[docs]def subtract_principal_component_by_attribute(df, attributes, pc=1):
    """
    Given a matrix `df` (n_samples, n_variables), remove principal component
    `pc` (1-based) from the matrix, fit separately for each group of samples
    whose index matches one of `attributes`.
    """
    from sklearn.decomposition import PCA

    pc -= 1

    x2 = pd.DataFrame(index=df.index, columns=df.columns)
    for attr in attributes:
        _LOGGER.info(attr)
        sel = df.index[df.index.str.contains(attr)]
        x = df.loc[sel, :]

        # PCA
        pca = PCA()
        x_hat = pca.fit_transform(x)

        # Remove PC
        x2.loc[sel, :] = x - np.outer(x_hat[:, pc], pca.components_[pc, :])

    # samples not matched by any attribute are kept unchanged
    for sample in df.index:
        if x2.loc[sample, :].isnull().all():
            x2.loc[sample, :] = df.loc[sample, :]
    return x2
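# Hypothetical usage sketch (not part of the original module): the batch is
# assumed to be encoded in the sample names, so each `attributes` entry selects
# one group of samples by substring matching on the index.
def _example_subtract_principal_component_by_attribute():
    idx = ["batchA_s{}".format(i) for i in range(5)] + [
        "batchB_s{}".format(i) for i in range(5)
    ]
    df = pd.DataFrame(np.random.normal(size=(10, 50)), index=idx)
    fixed = subtract_principal_component_by_attribute(
        df, attributes=["batchA", "batchB"], pc=1
    )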
[docs]def fix_batch_effect_limma(matrix, batch_variable="batch", covariates=None):
    """
    Fix a batch effect in a matrix using limma.

    Requires the R package "limma" to be installed:

    .. highlight:: R
    .. code-block:: R

        if (!requireNamespace("BiocManager", quietly = TRUE))
            install.packages("BiocManager")
        BiocManager::install("limma")

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        DataFrame with a MultiIndex on the columns for potential covariate annotations.

    batch_variable : :obj:`str`, optional
        Name of the column MultiIndex level with the batch annotation to regress out.

        Defaults to "batch".

    covariates : :obj:`list`, optional
        Names of column MultiIndex levels with covariates to preserve in the design.

        Defaults to :obj:`None` (no covariates).

    Returns
    -------
    :obj:`pandas.DataFrame`
        Matrix with the batch effect regressed out.
    """
    import patsy
    from rpy2.robjects import numpy2ri, pandas2ri, r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()
    pandas2ri.activate()

    importr("limma")

    if covariates is None:
        covariates = []

    if len(covariates) > 0:
        # design matrix of covariates whose effects should be preserved
        cov = patsy.dmatrix(
            "~{} - 1".format(" + ".join(covariates)), matrix.columns.to_frame()
        )
        fixed = r("removeBatchEffect")(
            x=matrix.values,
            batch=matrix.columns.get_level_values(batch_variable),
            design=cov,
        )
    else:
        fixed = r("removeBatchEffect")(
            x=matrix.values, batch=matrix.columns.get_level_values(batch_variable)
        )
    fixed = pd.DataFrame(np.asarray(fixed), index=matrix.index, columns=matrix.columns)
    return fixed
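# Hypothetical usage sketch (not part of the original module): requires a
# working rpy2 setup and the R package "limma"; sample and batch labels are
# placeholders carried in a column MultiIndex.
def _example_fix_batch_effect_limma():
    cols = pd.MultiIndex.from_tuples(
        [("s1", "b1"), ("s2", "b1"), ("s3", "b2"), ("s4", "b2")],
        names=["sample_name", "batch"],
    )
    matrix = pd.DataFrame(np.random.normal(size=(100, 4)), columns=cols)
    fixed = fix_batch_effect_limma(matrix, batch_variable="batch")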