Source code for ngs_toolkit.general

#!/usr/bin/env python


import os

import numpy as np
import pandas as pd

from ngs_toolkit import _CONFIG, _LOGGER
from ngs_toolkit.utils import get_this_file_or_timestamped


def get_genome_reference(
    organism,
    genome_assembly=None,
    output_dir=None,
    genome_provider="UCSC",
    file_format="2bit",
    dry_run=False,
    overwrite=True,
):
    """
    Get genome FASTA/2bit file.
    Saves results to disk and returns path to file.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Assembly to use.
        Defaults to the default assembly for the organism.
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to current directory.
    genome_provider : :obj:`str`, optional
        Which genome provider to use. One of 'UCSC' or 'Ensembl'.
    file_format : :obj:`str`, optional
        File format to get. One of 'fasta' or '2bit'.
    dry_run : :obj:`bool`, optional
        Whether to not download and just return path to file.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    {str, tuple}
        Path to genome FASTA/2bit file,
        but if ``dry_run``, a tuple of the URL of the reference genome and the path to the file.

    Raises
    ------
    ValueError
        If arguments are not in possible options or if desired combination
        is not available.
    """
    import pybedtools
    import subprocess
    import distutils.spawn

    from ngs_toolkit.utils import download_gzip_file, download_file

    def index_fasta(fasta):
        # The idea is to use a hidden method of bedtools
        # to create an index (to skip having e.g. samtools as a dependency)
        # and use the bedtools `nuc` command to do it.
        # This actually fails to get nucleotide content every time due to this 'bug':
        # https://github.com/daler/pybedtools/issues/147
        # but nonetheless creates an index.
        bed = pd.DataFrame([["chr1", 1, 2]])
        try:
            bed = pybedtools.BedTool().from_dataframe(bed)
            bed.nucleotide_content(fi=fasta)
        except pybedtools.helpers.BEDToolsError:
            pass

    def twobit_to_fasta(genome_file):
        if distutils.spawn.find_executable("twoBitToFa") is not None:
            args = ["twoBitToFa"] + [genome_file, genome_file.replace(".2bit", ".fa")]
            subprocess.check_output(args, shell=False)
            index_fasta(genome_file.replace(".2bit", ".fa"))

    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    opts = ["UCSC", "Ensembl"]
    if genome_provider not in opts:
        msg = "`genome_provider` attribute must be one of '{}'.".format(", ".join(opts))
        _LOGGER.error(msg)
        raise ValueError(msg)

    opts = ["fasta", "2bit"]
    if file_format not in opts:
        msg = "`file_format` attribute must be one of '{}'.".format(", ".join(opts))
        _LOGGER.error(msg)
        raise ValueError(msg)

    if (genome_provider == "Ensembl") and (file_format == "2bit"):
        msg = "Ensembl does not provide 2bit files."
        hint = " Use for example 'faToTwoBit' to convert the FASTA file."
        _LOGGER.error(msg + hint)
        raise ValueError(msg)

    if genome_provider == "UCSC":
        default_organisms = {
            "human": "hg38",
            "hsapiens": "hg38",
            "homo_sapiens": "hg38",
            "mouse": "mm10",
            "mmusculus": "mm10",
            "mus_musculus": "mm10",
            "yeast": "sacCer3",
            "scerevisiae": "sacCer3",
            "saccharomyces_cerevisiae": "sacCer3",
        }
        base_link = (
            "http://hgdownload.cse.ucsc.edu/goldenPath/{assembly}/bigZips/{assembly}"
        )
        base_link += ".fa.gz" if file_format == "fasta" else ".2bit"
        if genome_assembly is None:
            genome_assembly = default_organisms[organism]
        url = base_link.format(assembly=genome_assembly)

    elif genome_provider == "Ensembl":
        default_organisms = {
            "human": {
                "long_species": "homo_sapiens",
                "version": "grch38",
                "release": "75",
            },
            "hsapiens": {
                "long_species": "homo_sapiens",
                "version": "grch38",
                "release": "75",
            },
            "homo_sapiens": {
                "long_species": "homo_sapiens",
                "version": "grch38",
                "release": "75",
            },
            "mouse": {
                "long_species": "mus_musculus",
                "version": "grcm38",
                "release": "94",
            },
            "mmusculus": {
                "long_species": "mus_musculus",
                "version": "grcm38",
                "release": "94",
            },
            "mus_musculus": {
                "long_species": "mus_musculus",
                "version": "grcm38",
                "release": "94",
            },
            "yeast": {
                "long_species": "saccharomyces_cerevisiae",
                "version": "R64",
                "release": "94",
            },
            "scerevisiae": {
                "long_species": "saccharomyces_cerevisiae",
                "version": "R64",
                "release": "94",
            },
            "saccharomyces_cerevisiae": {
                "long_species": "saccharomyces_cerevisiae",
                "version": "R64",
                "release": "94",
            },
        }
        if genome_assembly is None:
            genome_assembly = default_organisms[organism]["version"].replace(
                "grc", "GRC"
            )
        base_link = (
            "ftp://ftp.ensembl.org/pub/release-{release}/fasta/{long_organism}/dna/"
        )
        base_link += "{Clong_organism}.{assembly}."
        base_link += (
            "{}.".format(default_organisms[organism]["release"])
            if genome_assembly.endswith("37")
            else ""
        )
        base_link += "{sequence_type}.{id_type}.{id}fa.gz".format(
            sequence_type="dna", id_type="primary_assembly", id=""
        )
        url = base_link.format(
            release=default_organisms[organism]["release"],
            long_organism=default_organisms[organism]["long_species"],
            Clong_organism=default_organisms[organism]["long_species"].capitalize(),
            assembly=genome_assembly,
        )

    if (
        (genome_provider == "UCSC")
        and (file_format == "fasta")
        and (genome_assembly != "hg38")
    ):
        msg = "UCSC does not provide FASTA files for the {} assembly.".format(
            genome_assembly
        )
        hint = " Download a 2bit file and use for example 'twoBitToFa' to convert."
        _LOGGER.error(msg + hint)
        raise ValueError(msg)

    genome_file = os.path.join(
        output_dir,
        "{}.{}.{}".format(
            organism, genome_assembly, "fa.gz" if file_format == "fasta" else "2bit"
        ),
    )
    if os.path.exists(genome_file) and (not overwrite):
        msg = "Genome file already exists and 'overwrite' is set to False."
        hint = " Returning existing file: {}".format(genome_file)
        _LOGGER.warning(msg + hint)
        # even so, if 2bit and FASTA not there, try to get FASTA
        if file_format == "2bit":
            if not os.path.exists(genome_file.replace(".2bit", ".fa")):
                twobit_to_fasta(genome_file)
        return genome_file

    # create .fai index for FASTA file
    if file_format == "fasta":
        if not dry_run:
            download_gzip_file(url, genome_file)
            index_fasta(genome_file)
            return genome_file
    else:
        if not dry_run:
            download_file(url, genome_file)
            twobit_to_fasta(genome_file)
            return genome_file

    return (url, genome_file)

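# Usage sketch (illustrative, not part of the module; the output directory
# is an assumption): download the default hg38 2bit file from UCSC, or just
# resolve the URL and target path with `dry_run`.
#
#   genome_file = get_genome_reference("human", output_dir="reference")
#   url, path = get_genome_reference(
#       "human", genome_provider="UCSC", file_format="fasta", dry_run=True)
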
def get_blacklist_annotations(
    organism, genome_assembly=None, output_dir=None, overwrite=True
):
    """
    Get annotations of blacklisted genomic regions for a given organism/genome assembly.
    Saves results to disk and returns a path to a BED file.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "hg38/grch38" and for "mouse" is "mm10/grcm38".
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    str
        Path to blacklist BED file
    """
    from ngs_toolkit.utils import download_gzip_file

    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    default_organisms = {"human": "hg38", "mouse": "mm10"}
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]
        _LOGGER.warning(
            "Genome assembly not selected. Using assembly '{}' for '{}'.".format(
                genome_assembly, organism
            )
        )

    output_file = os.path.join(
        output_dir, "{}.{}.blacklist.bed".format(organism, genome_assembly)
    )
    if os.path.exists(output_file) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation file: {}".format(output_file)
        _LOGGER.warning(msg + hint)
        return output_file

    url = "http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/"
    if genome_assembly not in ["hg19"]:
        url += "{0}-{1}/{0}.blacklist.bed.gz".format(genome_assembly, organism)
    else:
        url += "{0}-human/wgEncodeHg19ConsensusSignalArtifactRegions.bed.gz".format(
            genome_assembly
        )
    try:
        download_gzip_file(url, output_file)
    except OSError:
        msg = "Could not download file: {}".format(url)
        _LOGGER.error(msg)
        raise OSError(msg)
    return output_file

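# Usage sketch (illustrative; the output directory is an assumption):
# fetch the blacklist BED file for mouse mm10.
#
#   blacklist = get_blacklist_annotations(
#       "mouse", genome_assembly="mm10", output_dir="reference")
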
def get_tss_annotations(
    organism,
    genome_assembly=None,
    save=True,
    output_dir=None,
    chr_prefix=True,
    gene_types=["protein_coding", "processed_transcript", "lincRNA", "antisense"],
    overwrite=True,
):
    """
    Get annotations of TSS for a given organism/genome assembly.
    This is a simple approach using Biomart's API querying the Ensembl database.
    Saves results to disk and returns a dataframe.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "grch38" and for "mouse" is "grcm38".
    save : :obj:`bool`, optional
        Whether to save to disk under ``output_dir``.
        Defaults to :obj:`True`.
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    chr_prefix : :obj:`bool`, optional
        Whether chromosome names should have the "chr" prefix.
        Defaults to :obj:`True`.
    gene_types : :obj:`list`, optional
        Subset of transcript biotypes to keep.
        See the available biotypes here: https://www.ensembl.org/Help/Faq?id=468
        Defaults to 'protein_coding', 'processed_transcript', 'lincRNA', 'antisense'.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        DataFrame with genome annotations.
    """
    default_organisms = {
        "human": {"species": "hsapiens", "ensembl_version": "grch38"},
        "mouse": {"species": "mmusculus", "ensembl_version": "grcm38"},
        "yeast": {"species": "scerevisiae", "ensembl_version": "R64"},
    }
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]["ensembl_version"]
    if genome_assembly == "hg19":
        genome_assembly = "grch37"
    if genome_assembly == "hg38":
        genome_assembly = "grch38"
    if genome_assembly == "mm10":
        genome_assembly = "grcm38"
    if genome_assembly == "sacCer3":
        genome_assembly = "R64"

    # set the output directory default before building the output path,
    # so the overwrite check below does not fail when `output_dir` is None
    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")

    output_file = os.path.join(
        output_dir, "{}.{}.gene_annotation.tss.bed".format(organism, genome_assembly)
    )
    o = get_this_file_or_timestamped(output_file)
    if os.path.exists(o) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation from file: {}".format(o)
        _LOGGER.warning(msg + hint)
        annot = pd.read_csv(o, header=None, sep="\t")
        return annot

    if save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    attributes = [
        "chromosome_name",
        "start_position",
        "ensembl_gene_id",
        "external_gene_name",
        "strand",
        "transcript_biotype",
    ]
    res = query_biomart(
        attributes=attributes,
        species=default_organisms[organism]["species"],
        ensembl_version=genome_assembly,
    )

    if gene_types is None:
        res = res.drop(["transcript_biotype"], axis=1).drop_duplicates()
    else:
        res = res.loc[res["transcript_biotype"].isin(gene_types), :]

    if chr_prefix:
        res.loc[:, "chromosome_name"] = "chr" + res["chromosome_name"]

    res.loc[:, "start_position"] = res["start_position"].astype(int)
    res.loc[:, "strand"] = res["strand"].replace("1", "+").replace("-1", "-")
    res.loc[:, "end"] = res.apply(
        lambda x: x["start_position"] + 1
        if x["strand"] == "+"
        else x["start_position"],
        axis=1,
    )
    res.loc[:, "start_position"] = res.apply(
        lambda x: x["start_position"] - 1
        if x["strand"] == "-"
        else x["start_position"],
        axis=1,
    )

    # drop same-gene duplicates if starting in the same position but just
    # annotated with different biotypes
    res = (
        res[attributes[:2] + ["end"] + attributes[2:]]
        .sort_values(by=res.columns.tolist(), axis=0)
        .drop_duplicates(subset=res.columns[:3], keep="last")
    )

    # make real BED format
    res.loc[:, "fill"] = "."
    cols = [
        "chromosome_name",
        "start_position",
        "end",
        "external_gene_name",
        "fill",
        "strand",
        "ensembl_gene_id",
        "transcript_biotype",
    ]
    res = res.loc[:, cols]
    res.columns = [
        "chr",
        "start",
        "end",
        "gene_name",
        "score",
        "strand",
        "ensembl_gene_id",
        "transcript_biotype",
    ]

    # save
    if save:
        res.to_csv(output_file, index=False, header=False, sep="\t")
        output_file = os.path.join(
            output_dir,
            "{}.{}.gene_annotation.protein_coding.tss.bed".format(
                organism, genome_assembly
            ),
        )
        res[res["transcript_biotype"] == "protein_coding"].drop(
            attributes[-1], axis=1
        ).to_csv(output_file, index=False, header=False, sep="\t")
    return res

def get_genomic_context(
    organism,
    genome_assembly=None,
    save=True,
    output_dir=None,
    chr_prefix=True,
    region_subset=[
        "promoter",
        "exon",
        "5utr",
        "3utr",
        "intron",
        "genebody",
        "intergenic",
    ],
    gene_types=["protein_coding", "processed_transcript", "lincRNA", "antisense"],
    promoter_width=3000,
    overwrite=True,
):
    """
    Get annotations of genomic context (promoter, exon, intron, etc)
    for a given organism/genome assembly.
    This is a simple approach using Biomart's API querying the Ensembl database.
    Saves results to disk and returns a dataframe.

    The API call to BioMart can take a bit, so the function
    should take ~4 min for a human genome.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get annotation for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "grch38" and for "mouse" is "grcm38".
    save : :obj:`bool`, optional
        Whether to save to disk under ``output_dir``.
        Defaults to :obj:`True`.
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    chr_prefix : :obj:`bool`, optional
        Whether chromosome names should have the "chr" prefix.
        Defaults to :obj:`True`.
    region_subset : :obj:`list`, optional
        Genomic region types to annotate.
        Defaults to all of 'promoter', 'exon', '5utr', '3utr', 'intron',
        'genebody', 'intergenic'.
    gene_types : :obj:`list`, optional
        Subset of transcript biotypes to keep.
        See the available biotypes here: https://www.ensembl.org/Help/Faq?id=468
        Defaults to 'protein_coding', 'processed_transcript', 'lincRNA', 'antisense'.
    promoter_width : :obj:`int`, optional
        Width of promoter regions in basepairs, centered on the gene start.
        Defaults to 3000.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    :obj:`pandas.DataFrame`
        DataFrame with genome annotations.
    """
    from pybedtools import BedTool
    import pybedtools

    default_organisms = {
        "human": {
            "species": "hsapiens",
            "ensembl_version": "grch38",
            "ucsc_version": "hg38",
        },
        "mouse": {
            "species": "mmusculus",
            "ensembl_version": "grcm38",
            "ucsc_version": "mm10",
        },
        "yeast": {"species": "scerevisiae", "ensembl_version": "R64"},
    }
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]["ucsc_version"]
        ensembl_genome_assembly = default_organisms[organism]["ensembl_version"]
    elif genome_assembly == "hg19":
        ensembl_genome_assembly = "grch37"
    elif genome_assembly == "hg38":
        ensembl_genome_assembly = "grch38"
    elif genome_assembly == "mm10":
        ensembl_genome_assembly = "grcm38"
    elif genome_assembly == "sacCer3":
        ensembl_genome_assembly = "R64"
    else:
        _LOGGER.warning(
            "Genome assembly '{}' not recognized. "
            "Using it as the Ensembl version.".format(genome_assembly)
        )
        ensembl_genome_assembly = genome_assembly

    # set the output directory default before building the output path,
    # so the overwrite check below does not fail when `output_dir` is None
    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")

    output_file = os.path.join(
        output_dir,
        "{}.{}.genomic_context.bed".format(organism, ensembl_genome_assembly),
    )
    o = get_this_file_or_timestamped(output_file)
    if os.path.exists(o) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation from file: {}".format(o)
        _LOGGER.warning(msg + hint)
        annot = pd.read_csv(o, header=None, sep="\t")
        return annot

    if save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    attrs = [
        "ensembl_gene_id",
        "chromosome_name",
        "start_position",
        "end_position",
        "exon_chrom_start",
        "exon_chrom_end",
        "5_utr_start",
        "5_utr_end",
        "3_utr_start",
        "3_utr_end",
        "strand",
        "external_gene_name",
        "gene_biotype",
    ]
    res = query_biomart(
        attributes=attrs,
        species=default_organisms[organism]["species"],
        ensembl_version=ensembl_genome_assembly,
    )
    if chr_prefix:
        res["chromosome_name"] = "chr" + res["chromosome_name"]

    # Keep only desired gene types
    res = res.loc[res["gene_biotype"].isin(gene_types), :]
    # Get strand
    res.loc[:, "strand"] = res["strand"].replace("1", "+").replace("-1", "-")

    # remove exons which are also UTRs
    bed_cols = ["chromosome_name", "start_position", "end_position", "strand"]

    # Promoters = genebody:start +/- N
    promoter = res[bed_cols].drop_duplicates()
    for col in promoter.columns:
        if ("start" in col) or ("end" in col):
            promoter.loc[:, col] = promoter.loc[:, col].astype(int)
    promoter.loc[promoter["strand"] == "+", "start_position"] = promoter.loc[
        promoter["strand"] == "+", "start_position"
    ] - (promoter_width / 2)
    promoter.loc[promoter["strand"] == "-", "start_position"] = promoter.loc[
        promoter["strand"] == "-", "end_position"
    ] - (
        promoter_width / 2
    )  # + 1
    promoter.loc[promoter["strand"] == "+", "end_position"] = promoter.loc[
        :, "start_position"
    ] + (promoter_width / 2)
    for col in bed_cols[1:3]:
        promoter.loc[:, col] = promoter.loc[:, col].astype(int)
    # fix where promoter start < 0
    promoter.loc[promoter["start_position"] < 0, "start_position"] = 0
    # remove end < start
    promoter = promoter.loc[~(promoter["start_position"] > promoter["end_position"])]

    # genebody = start->end + promoter
    gb = res[bed_cols].drop_duplicates()
    for col in gb.columns:
        if ("start" in col) or ("end" in col):
            gb.loc[:, col] = gb.loc[:, col].astype(int)
    gb = gb.append(promoter)
    # for col in bed_cols[1:3]:
    #     gb.loc[:, col] = gb.loc[:, col].astype(int)
    gb = gb.sort_values(gb.columns.tolist())
    genebody_bed = BedTool.from_dataframe(gb)
    genebody_bed = genebody_bed.sort().merge()
    genebody = genebody_bed.to_dataframe()

    # Exon
    exon = (
        res[["chromosome_name", "exon_chrom_start", "exon_chrom_end"]]
        .drop_duplicates()
        .dropna()
    )
    # 5utr
    utr5 = (
        res[["chromosome_name", "5_utr_start", "5_utr_end"]].drop_duplicates().dropna()
    )
    # 3utr
    utr3 = (
        res[["chromosome_name", "3_utr_start", "3_utr_end"]].drop_duplicates().dropna()
    )
    for d in [exon, utr5, utr3]:
        for col in d.columns:
            if ("start" in col) or ("end" in col):
                d.loc[:, col] = d.loc[:, col].astype(int)

    # Introns = genebody - (promoter + exon + 5utr + 3utr)
    promoter = promoter.drop(["strand"], axis=1)
    exon.columns = utr3.columns = utr5.columns = promoter.columns = bed_cols[:3]
    others = promoter.append(exon).append(utr5).append(utr3)
    for col in others.columns[1:]:
        others.loc[:, col] = others.loc[:, col].astype(int)
    intron = genebody_bed.subtract(BedTool.from_dataframe(others))
    intron = intron.to_dataframe()

    # Intergenic = ChromSizes - genebody
    cs = pybedtools.get_chromsizes_from_ucsc(genome_assembly)
    cs = pd.DataFrame(cs).T.reset_index()
    if not chr_prefix:
        cs["index"] = cs["index"].str.replace("chr", "")
    cs = BedTool.from_dataframe(cs)
    intergenic = cs.subtract(genebody_bed)
    intergenic = intergenic.to_dataframe()

    # join all
    features = ["promoter", "genebody", "exon", "intron", "utr5", "utr3", "intergenic"]
    annot = list()
    for name in features:
        cur = locals()[name]
        cur = cur.iloc[:, list(range(3))]
        cur.columns = bed_cols[:3]
        cur.loc[:, "region_type"] = name
        if save:
            cur.sort_values(cur.columns.tolist()).drop("region_type", axis=1).to_csv(
                os.path.join(
                    output_dir,
                    "{}.{}.genomic_context.{}.bed".format(
                        organism, ensembl_genome_assembly, name
                    ),
                ),
                index=False,
                header=False,
                sep="\t",
            )
        annot.append(cur)

    annot = pd.concat(annot)
    annot = annot.sort_values(annot.columns.tolist())

    # save
    if save:
        annot.to_csv(output_file, index=False, header=False, sep="\t")
    return annot

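# Usage sketch (illustrative; the output directory is an assumption):
# build the genomic context annotation for human hg38. The BioMart
# query can take a few minutes.
#
#   context = get_genomic_context(
#       "human", genome_assembly="hg38", output_dir="reference")
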
def get_chromosome_sizes(
    organism, genome_assembly=None, output_dir=None, overwrite=True
):
    """
    Get a file with the sizes of chromosomes in a given organism/genome assembly.
    Saves results to disk and returns a path to a text file.

    Parameters
    ----------
    organism : :obj:`str`
        Organism to get chromosome sizes for. Currently supported: "human" and "mouse".
    genome_assembly : :obj:`str`, optional
        Ensembl assembly/version to use.
        Default for "human" is "hg38/grch38" and for "mouse" is "mm10/grcm38".
    output_dir : :obj:`str`, optional
        Directory to write output to.
        Defaults to "reference" in current directory.
    overwrite : :obj:`bool`, optional
        Whether existing files should be overwritten by new ones.
        Otherwise they will be kept and no action is made.
        Defaults to :obj:`True`.

    Returns
    -------
    str
        Path to text file with chromosome sizes.
    """
    import pybedtools

    if output_dir is None:
        output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    default_organisms = {"human": "hg38", "mouse": "mm10"}
    if genome_assembly is None:
        genome_assembly = default_organisms[organism]
        _LOGGER.warning(
            "Genome assembly not selected. Using assembly '{}' for '{}'.".format(
                genome_assembly, organism
            )
        )

    output_file = os.path.join(
        output_dir, "{}.{}.chromosome_sizes.txt".format(organism, genome_assembly)
    )
    if os.path.exists(output_file) and (not overwrite):
        msg = "Annotation file already exists and 'overwrite' is set to False."
        hint = " Returning existing annotation file: {}".format(output_file)
        _LOGGER.warning(msg + hint)
        return output_file

    sizes = pybedtools.get_chromsizes_from_ucsc(genome_assembly)
    with open(output_file, "w") as handle:
        for chrom, (_, size) in sizes.items():
            handle.write("{}\t{}\n".format(chrom, size))
    return output_file

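# Usage sketch (illustrative; the output directory is an assumption):
#
#   chrom_sizes = get_chromosome_sizes(
#       "human", genome_assembly="hg38", output_dir="reference")
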
def deseq_analysis(
    count_matrix,
    experiment_matrix,
    comparison_table,
    formula,
    output_dir,
    output_prefix,
    overwrite=True,
    alpha=0.05,
    independent_filtering=False,
    create_subdirectories=True,
    save_inputs=True,
    **kwargs
):
    """
    Perform differential comparison analysis with DESeq2.

    .. note::
        Do not include hyphens ("-") in any of the samples or groups names!
        R freaks out with this.

    # TODO: fix hyphens in names issue

    Parameters
    ----------
    count_matrix : :obj:`pandas.DataFrame`
        Data frame of shape (samples, variables) with raw read counts.
    experiment_matrix : :obj:`pandas.DataFrame`
        Data frame with columns "sample_name" and any other variables used in the `formula`.
    comparison_table : :obj:`pandas.DataFrame`
        Data frame with columns "comparison_name", "sample_group" and "sample_name".
    formula : :obj:`str`
        Formula to test in R/patsy notation. Usually something like "~ batch + group".
    output_dir : :obj:`str`
        Output directory for produced files.
    output_prefix : :obj:`str`
        Prefix to add to produced files.
    overwrite : :obj:`bool`, optional
        Whether files existing should be overwritten.
        Defaults to :obj:`True`.
    alpha : number, optional
        Significance level to reject null hypothesis.
        This in practice has no effect as results for all features will be returned.
        Defaults to 0.05.
    independent_filtering : :obj:`bool`, optional
        Whether to use independent filtering in DESeq2's `results` call.
        Defaults to :obj:`False`.
    create_subdirectories : :obj:`bool`
        Whether to create subdirectories for the result of each comparison.
    save_inputs : :obj:`bool`, optional
        Whether to write the input matrices to disk.
        Defaults to :obj:`True`.
    **kwargs : :obj:`dict`
        Additional keyword arguments to be passed to the DESeq function of DESeq2.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Data frame with results, statistics for each feature.
    """
    from tqdm import tqdm
    from ngs_toolkit.utils import r2pandas_df, recarray2pandas_df

    from rpy2.robjects import numpy2ri, pandas2ri, r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()
    pandas2ri.activate()

    importr("DESeq2")

    # order experiment and count matrices in same way
    experiment_matrix = experiment_matrix.set_index("sample_name").loc[
        count_matrix.columns, :
    ]

    # save the matrices just in case
    if save_inputs:
        count_matrix.to_csv(
            os.path.join(output_dir, output_prefix + ".count_matrix.tsv"), sep="\t"
        )
        experiment_matrix.to_csv(
            os.path.join(output_dir, output_prefix + ".experiment_matrix.tsv"), sep="\t"
        )
        comparison_table.to_csv(
            os.path.join(output_dir, output_prefix + ".comparison_table.tsv"), sep="\t"
        )

    # Rename samples to avoid R errors with sample names containing symbols
    count_matrix.columns = ["S{}".format(i) for i in range(len(count_matrix.columns))]
    experiment_matrix.index = [
        "S{}".format(i) for i in range(len(experiment_matrix.index))
    ]
    # Replace hyphens with underscores
    experiment_matrix = experiment_matrix.replace("-", "_")
    comparison_table = comparison_table.replace("-", "_")

    # Run DESeq analysis
    dds = r.DESeqDataSetFromMatrix(
        countData=count_matrix.astype(int),
        colData=experiment_matrix,
        design=r("as.formula")(formula),
    )
    dds = r.DESeq(dds, parallel=True, **kwargs)
    # _save(dds, file=os.path.join(output_dir, output_prefix + ".deseq_dds_object.Rdata"))

    results = list()
    comps = comparison_table["comparison_name"].drop_duplicates().sort_values()
    for comp in tqdm(comps, total=len(comps), desc="Comparison"):
        if create_subdirectories:
            if not os.path.exists(os.path.join(output_dir, comp)):
                os.makedirs(os.path.join(output_dir, comp))
            out_file = os.path.join(
                output_dir, comp, output_prefix + ".deseq_result.{}.csv".format(comp)
            )
        else:
            out_file = os.path.join(
                output_dir, output_prefix + ".deseq_result.{}.csv".format(comp)
            )

        if not overwrite and os.path.exists(get_this_file_or_timestamped(out_file)):
            continue
        _LOGGER.info("Doing comparison '{}'".format(comp))
        a = (
            comparison_table.loc[
                (comparison_table["comparison_name"] == comp)
                & (comparison_table["comparison_side"] >= 1),
                "sample_group",
            ]
            .drop_duplicates()
            .squeeze()
        )
        b = (
            comparison_table.loc[
                (comparison_table["comparison_name"] == comp)
                & (comparison_table["comparison_side"] <= 0),
                "sample_group",
            ]
            .drop_duplicates()
            .squeeze()
        )

        contrast = np.array(["sample_group", a, b])
        try:
            res = r("as.data.frame")(
                r.results(
                    dds,
                    contrast=contrast,
                    alpha=alpha,
                    independentFiltering=independent_filtering,
                    parallel=True,
                )
            )
        except Exception as e:
            _LOGGER.warning("DESeq2 group contrast '{}' didn't work!".format(contrast))
            _LOGGER.error(e)
            contrast = ["sample_group" + a, "sample_group" + b]
            try:
                res = r("as.data.frame")(
                    r.results(
                        dds,
                        contrast=contrast,
                        alpha=alpha,
                        independentFiltering=independent_filtering,
                        parallel=True,
                    )
                )
                _LOGGER.warning(
                    "DESeq2 group contrast '{}' did work now!".format(
                        ", ".join(contrast)
                    )
                )
            except Exception as e2:
                _LOGGER.warning(
                    "DESeq2 group contrast '{}' didn't work either!".format(
                        ", ".join(contrast)
                    )
                )
                _LOGGER.error(e2)
                raise e2

        if isinstance(res, np.recarray):
            res = recarray2pandas_df(res)
            res.index = count_matrix.index

        if not isinstance(res, pd.DataFrame):
            # convert to pandas dataframe
            res = r2pandas_df(res)
        res.loc[:, "comparison_name"] = comp
        res.index.name = "index"

        # save
        res.sort_values("pvalue").to_csv(out_file)
        # append
        results.append(res)

    # save all
    results = pd.concat(results)
    results.index.name = "index"
    results.to_csv(
        os.path.join(output_dir, output_prefix + ".deseq_result.all_comparisons.csv"),
        index=True,
    )

    # return
    return results

def least_squares_fit(
    matrix,
    design_matrix,
    test_model,
    null_model="~ 1",
    standardize_data=True,
    multiple_correction_method="fdr_bh",
):
    """
    Fit a least squares model with only categorical predictors.
    Computes p-values by comparing the log likelihood ratio of the
    chosen model to a `null_model`.

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        A Data frame of shape (samples, variables).
    design_matrix : :obj:`pandas.DataFrame`
        A Data frame of shape (samples, variables) with all the variables in `test_model`.
    test_model : :obj:`str`
        Model design to test in R/patsy notation.
    null_model : :obj:`str`, optional
        Null model design in R/patsy notation.
        Defaults to "~ 1".
    standardize_data : :obj:`bool`, optional
        Whether data should be standardized prior to fitting.
        Defaults to :obj:`True`.
    multiple_correction_method : :obj:`str`, optional
        Method to use for multiple test correction.
        See statsmodels.sandbox.stats.multicomp.multipletests.
        Defaults to "fdr_bh".

    Returns
    -------
    :obj:`pandas.DataFrame`
        Statistics of model fitting and comparison between models for each feature.

    :Example:

        matrix = np.random.random(10000000).reshape(100, 100000)
        P = np.concatenate([[0] * 50, [1] * 50])  # dependent variable
        Q = np.concatenate([[0] * 25, [1] * 25] + [[0] * 25, [1] * 25])  # covariate
        design_matrix = pd.DataFrame([P, Q], index=["P", "Q"]).T
        matrix = matrix.T * ((1 + design_matrix.sum(axis=1)) * 4).values
        matrix = pd.DataFrame(matrix.T)
        test_model = "~ Q + P"
        null_model = "~ Q"
        res = least_squares_fit(matrix, design_matrix, test_model, null_model)
        res.head()
    """
    import patsy
    from scipy import stats
    from scipy.linalg import lstsq
    from sklearn.preprocessing import StandardScaler
    from statsmodels.sandbox.stats.multicomp import multipletests

    if standardize_data:
        norm = StandardScaler()
        matrix = pd.DataFrame(
            norm.fit_transform(matrix), index=matrix.index, columns=matrix.columns
        )

    a1 = patsy.dmatrix(test_model, design_matrix)
    betas1, residuals1, _, _ = lstsq(a1, matrix)

    a0 = patsy.dmatrix(null_model, design_matrix)
    betas0, residuals0, _, _ = lstsq(a0, matrix)

    results = pd.DataFrame(
        betas1.T, columns=a1.design_info.column_names, index=matrix.columns
    )

    # Calculate the log-likelihood ratios
    n = float(matrix.shape[0])
    results["model_residuals"] = residuals1
    results["null_residuals"] = residuals0
    results["model_log_likelihood"] = (
        (-n / 2.0) * np.log(2 * np.pi)
        - n / 2.0 * np.log(results["model_residuals"] / n)
        - n / 2.0
    )
    results["null_log_likelihood"] = (
        (-n / 2.0) * np.log(2 * np.pi)
        - n / 2.0 * np.log(results["null_residuals"] / n)
        - n / 2.0
    )
    results["log_likelihood_ratio"] = (
        results["model_log_likelihood"] - results["null_log_likelihood"]
    )
    results["D_statistic"] = 2 * results["log_likelihood_ratio"]
    results["p_value"] = stats.chi2.sf(
        results["log_likelihood_ratio"], df=betas1.shape[0] - betas0.shape[0]
    )
    results["q_value"] = multipletests(
        results["p_value"], method=multiple_correction_method
    )[1]

    if not standardize_data:
        results["mean"] = matrix.mean(axis=0)

    return results

def differential_from_bivariate_fit(
    comparison_table,
    matrix,
    output_dir,
    output_prefix,
    n_bins=250,
    multiple_correction_method="fdr_bh",
    plot=True,
    palette="colorblind",
    make_values_positive=False,
):
    """
    Perform differential analysis using a bivariate gaussian fit
    on the relationship between mean and fold-change for each comparison.

    Parameters
    ----------
    comparison_table : :obj:`pandas.DataFrame`
        Dataframe with 'comparison_name', 'comparison_side', 'sample_name'
        and 'sample_group' columns.
    matrix : :obj:`pandas.DataFrame`
        Matrix of `n_features, n_samples` with normalized, log-transformed
        values to perform analysis on.
    output_dir : :obj:`str`
        Output directory.
    output_prefix : :obj:`str`
        Prefix for outputs.
    n_bins : :obj:`int`
        Number of bins of mean values along which to standardize fold-changes.
    multiple_correction_method : :obj:`str`
        Multiple correction method from `statsmodels.sandbox.stats.multicomp.multipletests`.
    plot : :obj:`bool`
        Whether to generate plots.
    palette : :obj:`str`
        Color palette to use.
        This can be any matplotlib palette and is passed to `sns.color_palette`.
    make_values_positive : :obj:`bool`
        Whether to transform `matrix` to have minimum value 0.
        Default False.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Results of fitting and comparison between groups for each feature.
    """
    import matplotlib.pyplot as plt
    from ngs_toolkit.graphics import savefig
    from scipy.stats import gaussian_kde
    import seaborn as sns
    from statsmodels.sandbox.stats.multicomp import multipletests

    comparisons = comparison_table["comparison_name"].drop_duplicates().sort_values()
    if plot:
        fig, axis = plt.subplots(
            2,
            len(comparisons),
            figsize=(4 * len(comparisons), 4 * 2),
            sharex=True,
            sharey="row",
        )

    if make_values_positive:
        matrix = matrix + abs(matrix.min().min())

    results = pd.DataFrame()
    for i, comparison in enumerate(comparisons):
        _LOGGER.info("Doing comparison '{}'".format(comparison))
        out_file = os.path.join(
            output_dir, output_prefix + ".fit_result.{}.csv".format(comparison)
        )

        sa = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 1),
            "sample_name",
        ]
        ga = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 1),
            "sample_group",
        ].squeeze()
        sb = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 0),
            "sample_name",
        ]
        gb = comparison_table.loc[
            (comparison_table["comparison_name"] == comparison)
            & (comparison_table["comparison_side"] == 0),
            "sample_group",
        ].squeeze()
        a = matrix.loc[:, sa].mean(axis=1)
        a.name = ga
        b = matrix.loc[:, sb].mean(axis=1)
        b.name = gb

        # assemble stats
        res = a.to_frame()
        res = res.join(b)
        res["global_mean"] = matrix.mean(axis=1)
        res["global_std"] = matrix.std(axis=1)
        res["comparison_mean"] = res.mean(axis=1)
        res["comparison_std"] = res.mean(axis=1)
        res["log2FoldChange"] = np.log2(res[ga] / res[gb])
        res["comparison_name"] = comparison

        # standardize fold change
        bounds = np.linspace(0, res["comparison_mean"].max(), n_bins)
        for (start, end) in zip(bounds[:-2], bounds[1:-1]):
            r = res.loc[
                (res["comparison_mean"] > start) & (res["comparison_mean"] < end)
            ].index
            v = res.loc[r, "log2FoldChange"]
            res.loc[r, "norm_log2FoldChange"] = (v - np.nanmean(v)) / np.nanstd(v)

        # let's try a bivariate gaussian kernel
        # separately for positive and negative to avoid biases in center of mass
        kernel = gaussian_kde(
            res.loc[
                res["norm_log2FoldChange"] > 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )
        res.loc[res["norm_log2FoldChange"] > 0, "density"] = kernel(
            res.loc[
                res["norm_log2FoldChange"] > 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )
        kernel = gaussian_kde(
            res.loc[
                res["norm_log2FoldChange"] <= 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )
        res.loc[res["norm_log2FoldChange"] <= 0, "density"] = kernel(
            res.loc[
                res["norm_log2FoldChange"] <= 0,
                ["comparison_mean", "norm_log2FoldChange"],
            ].T.values
        )

        # Let's calculate something like an empirical p-value on the density
        res["pvalue"] = (res["density"] - res["density"].min()) / (
            res["density"].max() - res["density"].min()
        )
        res["padj"] = multipletests(
            res["pvalue"].fillna(1), method=multiple_correction_method
        )[1]

        res["direction"] = (res["norm_log2FoldChange"] >= 0).astype(int).replace(0, -1)
        res.to_csv(out_file)

        if plot:
            axis[0, i].scatter(
                res["comparison_mean"],
                res["log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[0],
                rasterized=True,
            )
            axis[0, i].axhline(0, color="black", linestyle="--")
            axis[1, i].scatter(
                res["comparison_mean"],
                res["norm_log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[0],
                rasterized=True,
            )
            diff = res.loc[
                (res["pvalue"] < 0.05) & (res["norm_log2FoldChange"].abs() >= 2), :
            ].index
            axis[0, i].scatter(
                res.loc[diff, "comparison_mean"],
                res.loc[diff, "log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[1],
                rasterized=True,
            )
            axis[1, i].scatter(
                res.loc[diff, "comparison_mean"],
                res.loc[diff, "norm_log2FoldChange"],
                alpha=0.2,
                s=5,
                color=sns.color_palette(palette)[1],
                rasterized=True,
            )
            axis[1, i].axhline(0, color="black", linestyle="--")
            axis[0, i].set_title(comparison + "\n" + ga + " vs " + gb)
            axis[1, i].set_xlabel("Comparison mean")
            if i == 0:
                axis[0, i].set_ylabel("log2 fold-change")
                axis[1, i].set_ylabel("Norm(log2 fold-change)")

        results = results.append(res.reset_index(), ignore_index=True)

    # save figure (only if one was created)
    if plot:
        savefig(
            fig,
            os.path.join(
                output_dir, output_prefix + ".deseq_result.all_comparisons.scatter.svg"
            ),
        )

    # save all
    results = results.set_index("index")
    results.to_csv(
        os.path.join(output_dir, output_prefix + ".deseq_result.all_comparisons.csv"),
        index=True,
    )

    return results

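# Usage sketch (illustrative; `comparisons` and `norm_matrix` stand for a
# comparison table and a normalized, log-transformed matrix as described
# in the docstring above):
#
#   res = differential_from_bivariate_fit(
#       comparisons, norm_matrix, output_dir="bivariate",
#       output_prefix="example", plot=False)
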
# def independent_filtering(df, alpha=0.05, n_quantiles=100):
#     """
#     """
#     req_columns = ["pvalue", "baseMean"]
#     assert all([x in df.columns for x in req_columns])

#     # compute quantiles across mean and pvalue distributions
#     stats = pd.DataFrame()
#     p = (np.arange(n_quantiles) / float(n_quantiles)) * 100
#     p = np.append(p, 100.)
#     for start, end in zip(p, p[1:]):
#         m = np.log2(1 + df['baseMean'])
#         i = df.loc[
#             (m >= np.percentile(m, start)) &
#             (m <= np.percentile(m, end)), :].index
#         stats.loc[start, "n"] = i.shape[0]
#         stats.loc[start, "mean"] = df.loc[i, "baseMean"].mean()
#         stats.loc[start, "mean_p"] = df.loc[i, "pvalue"].mean()
#         stats.loc[start, "n_sig_p"] = (df.loc[i, "pvalue"] < alpha).sum()

#     # plot
#     fig, axis = plt.subplots(1, 2, figsize=(4 * 2, 4 * 1))
#     axis[0].scatter(stats.index, stats.loc[:, "n_sig_p"])
#     axis[1].scatter(stats.index, -np.log10(stats.loc[:, "mean_p"]))

#     # choose inflection point

#     return


# def fit_curve():
#     def func(x, a, b, c):
#         return a * np.exp(-b * x) + c

#     plt.plot(stats.index, -np.log10(stats['mean_p']), 'b-', label='data')

#     popt, pcov = curve_fit(func, stats.index, -np.log10(stats['mean_p']))
#     plt.plot(stats.index, func(stats.index, *popt), 'r-', label='fit')

#     popt, pcov = curve_fit(func, stats.index, stats['mean_p'], bounds=(0, [3., 2., 1.]))
#     plt.plot(stats.index, func(stats.index, *popt), 'g--', label='fit-with-bounds')
#     plt.xlabel('x')
#     plt.ylabel('y')
#     plt.legend()
#     plt.show()

def lola(bed_files, universe_file, output_folder, genome, output_prefixes=None, cpus=8):
    """
    Perform location overlap analysis (LOLA).

    If bed_files is a list with more than one element, use ``output_prefixes``
    to pass a list of prefixes to label the output files for each input BED file.

    Files will be created in ``output_folder`` mimicking the output that the
    R function LOLA::writeCombinedEnrichment writes.

    Requires the R package "LOLA" to be installed:

    .. highlight:: R
    .. code-block:: R

        if (!requireNamespace("BiocManager", quietly = TRUE))
            install.packages("BiocManager")
        BiocManager::install("LOLA")

    Parameters
    ----------
    bed_files : {str, list}
        A string path to a BED file or a list of paths.
    universe_file : :obj:`str`
        A path to a BED file representing the universe from where the BED file(s) come from.
    output_folder : :obj:`str`
        Output folder for resulting files.
    genome : :obj:`str`
        Genome assembly from which the BED files come from.
        This is used to get the LOLA databases from the :obj:`ngs_toolkit._CONFIG` parameters.
    output_prefixes : :obj:`list`, optional
        A list of strings with prefixes to be used in case ``bed_files`` is a list.
    cpus : :obj:`int`, optional
        Number of CPUs/threads to use.
        Defaults to 8.
    """
    from ngs_toolkit.utils import r2pandas_df
    from rpy2.robjects import numpy2ri, pandas2ri, r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()
    pandas2ri.activate()

    importr("LOLA")

    # Get region databases from config
    _LOGGER.info(
        "Getting LOLA databases for genome '%s' from configuration.", genome)

    msg = (
        "LOLA database values in configuration could not be found or understood. "
        "Please add a list of value(s) to this section "
        "'resources:lola:region_databases:%s'. "
        "For an example, see "
        "https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/config/example.yaml")
    try:
        databases = _CONFIG["resources"]["lola"]["region_databases"][genome]
    except KeyError:
        _LOGGER.error(msg, genome)
        raise

    if not isinstance(databases, list):
        if isinstance(databases, str):
            # wrap a single path in a list (list() on a string would split it
            # into characters)
            databases = [databases]
        else:
            _LOGGER.error(msg, genome)
            raise KeyError(msg % genome)
    if len(databases) < 1:
        _LOGGER.error(msg, genome)
        raise KeyError(msg % genome)

    if isinstance(bed_files, str):
        bed_files = [bed_files]
    if output_prefixes is None:
        if len(bed_files) > 1:
            msg = (
                "Running more than one BED file at once while only specifying "
                "`output_folder` argument will cause output files to be named "
                "in the form "
                "'{output_folder}/{region_database}.{input_file}.tsv'."
                " To prevent this behaviour, pass a list of arguments to "
                "`output_prefixes`.")
            _LOGGER.warning(msg)
            output_prefixes = [
                rr.replace(os.path.sep, "__").replace(".bed", ".") for rr in bed_files
            ]
        else:
            output_prefixes = ["."]

    _LOGGER.info("Reading up universe file '{}'.".format(universe_file))
    universe = r("LOLA::readBed")(universe_file)
    _LOGGER.info("Loading region set databases.")
    _regionDB = r("LOLA::loadRegionDB")(np.array(databases))

    for suffix, bed_file in zip(output_prefixes, bed_files):
        _LOGGER.info("Reading up BED file '{}'.".format(bed_file))
        user_set = r("LOLA::readBed")(bed_file)

        _LOGGER.info("Running LOLA testing for file '{}'.".format(bed_file))
        _lola_results = r("LOLA::runLOLA")(user_set, universe, _regionDB, cores=cpus)

        _LOGGER.info("Converting results from R to Python")
        lola_results = r2pandas_df(_lola_results)

        _LOGGER.info("Saving all results for file '{}'.".format(bed_file))
        lola_results.to_csv(
            os.path.join(output_folder, "allEnrichments" + suffix + "tsv"),
            index=False,
            sep="\t",
        )

        for region_set in lola_results["collection"].drop_duplicates():
            _LOGGER.info("Saving results for collection '%s' only.", region_set)
            lola_results[lola_results["collection"] == region_set].to_csv(
                os.path.join(output_folder, "col_" + region_set + suffix + "tsv"),
                index=False,
                sep="\t",
            )

def meme_ame(
    input_fasta,
    output_dir,
    background_fasta=None,
    organism="human",
    motif_database_file=None,
):
    import subprocess

    if motif_database_file is None:
        # Get motif database from config
        _LOGGER.info(
            "Getting motif database for organism '%s' from configuration.", organism)

        msg = (
            "Motif database value in configuration could not "
            "be found or understood. "
            "Please add a list of value(s) to this section "
            "'resources:meme:motif_databases:%s'. "
            "For an example, see "
            "https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/config/example.yaml")
        try:
            motif_database_file = _CONFIG["resources"]["meme"]["motif_databases"][
                organism
            ]
        except KeyError:
            _LOGGER.error(msg, organism)
            return
        if not isinstance(motif_database_file, str):
            _LOGGER.error(msg, organism)
            return

    # shuffle input if no background is provided
    if background_fasta is None:
        shuffled = input_fasta + ".shuffled"
        cmd = "fasta-dinucleotide-shuffle -c 1 -f {0} > {1}".format(
            input_fasta, shuffled
        )
        # the command contains a shell redirection, so it must run in a shell
        subprocess.call(cmd, shell=True)

    cmd = (
        "ame --bgformat 1 --scoring avg --method ranksum "
        "--pvalue-report-threshold 0.05 --control {0} -o {1} {2} {3}"
    ).format(
        background_fasta if background_fasta is not None else shuffled,
        output_dir,
        input_fasta,
        motif_database_file,
    )
    subprocess.call(cmd.split(" "))
    # subprocess.call("rm {}".format(shuffled).split(" "))


def homer_motifs(bed_file, output_dir, genome_assembly):
    import subprocess

    cmd = (
        "findMotifsGenome.pl {bed} {genome}r {out_dir} "
        "-size 1000 -h -p 2 -len 8,10,12,14 -noknown"
    ).format(bed=bed_file, genome=genome_assembly, out_dir=output_dir)
    subprocess.call(cmd.split(" "))

def homer_combine_motifs(
    comparison_dirs,
    output_dir,
    region_prefix="differential_analysis",
    reduce_threshold=0.6,
    match_threshold=10,
    info_value=0.6,
    p_value_threshold=1e-25,
    fold_enrichment=None,
    cpus=8,
    run=True,
    distributed=True,
    genome="hg38",
    motif_database=None,
):
    """
    Create consensus of de novo discovered motifs from HOMER.

    Parameters
    ----------
    comparison_dirs : :obj:`list`
        Iterable of comparison directories where homer was run.
        Should contain a "homerMotifs.all.motifs" file.
    output_dir : :obj:`str`
        Output directory.
    region_prefix : :obj:`str`, optional
        Prefix of the "<prefix>_regions.bed" files used when re-running enrichment.
        Defaults to "differential_analysis".
    reduce_threshold : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-reduceThresh`.
        Defaults to 0.6.
    match_threshold : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-matchThresh`.
        Defaults to 10.
    info_value : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-info`.
        Defaults to 0.6.
    p_value_threshold : number, optional
        Threshold for inclusion of a motif in the consensus set.
        Defaults to 1e-25.
    fold_enrichment : number, optional
        Passed to HOMER's `compareMotifs.pl` as `-F`, if given.
    cpus : number, optional
        Number of available CPUS/threads for multithread processing.
        Defaults to 8.
    run : :obj:`bool`, optional
        Whether to run enrichment of each comparison in the consensus motifs.
        Default is True.
    distributed : :obj:`bool`, optional
        Whether to run enrichment as a cluster job.
        Default is True.
    genome : :obj:`str`
        Genome assembly of the data.
        Default is 'hg38'.
    motif_database : :obj:`str`
        Motif database to restrict motif matching to.

    Returns
    -------
    {str, None}
        If `run` is `False`, returns path to consensus motif file. Otherwise `None`.
    """
    from glob import glob
    import subprocess

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # concatenate files
    out_file = os.path.join(output_dir, "homerMotifs.combined.motifs")
    with open(out_file, "a") as outfile:
        for dir_ in comparison_dirs:
            with open(os.path.join(dir_, "homerMotifs.all.motifs"), "r") as infile:
                for line in infile:
                    outfile.write(line)

    # Filter and get motif consensus
    extra = ""
    if motif_database:
        extra = " -known {}".format(motif_database)
    if fold_enrichment is None:
        fold_enrichment = ""
    else:
        fold_enrichment = " -F " + str(fold_enrichment)
    subprocess.call(
        "compareMotifs.pl {} {} -reduceThresh {} -matchThresh {}{} -pvalue {} -info {}{} -nofacts -cpu {}".format(
            out_file,
            output_dir,
            reduce_threshold,
            match_threshold,
            extra,
            p_value_threshold,
            info_value,
            fold_enrichment,
            cpus,
        ).split(" ")
    )

    # concatenate consensus motif files
    files = glob(os.path.join(output_dir, "homerResults/*motif"))
    combined_motifs = os.path.join(output_dir, "homerMotifs.filtered.motifs")
    with open(combined_motifs, "w") as outfile:
        for f in files:
            if ("similar" in f) or ("RV" in f):
                continue
            _LOGGER.info(f)
            with open(f, "r") as infile:
                for line in infile:
                    outfile.write(line)

    # return the consensus motif file if not running enrichment,
    # as documented above
    if not run:
        return combined_motifs

    for dir_ in comparison_dirs:
        # delete previous results if existing
        results = os.path.join(dir_, "knownResults.txt")
        if os.path.exists(results):
            _LOGGER.warning(
                "Deleting previously existing results file: '{}'".format(results)
            )
            os.remove(results)
        # prepare enrichment command with consensus set
        cmd = "findMotifsGenome.pl {bed} {genome}r {dir} -p {cpus} -nomotif -mknown {motif_file}".format(
            bed=os.path.join(dir_, region_prefix + "_regions.bed"),
            genome=genome,
            cpus=cpus,
            dir=dir_,
            motif_file=combined_motifs,
        )
        # run
        if distributed:
            subprocess.call(
                "sbatch -J homer.{d} -o {dir}.homer.log -p shortq -c 8 --mem 20000".format(
                    d=os.path.basename(dir_), dir=dir_
                ).split(" ")
                + ["--wrap", cmd]
            )
        else:
            subprocess.call(cmd.split(" "))

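# Usage sketch (illustrative directory names): combine de novo motifs from
# two comparison directories into a consensus set without re-running the
# per-comparison enrichment.
#
#   consensus = homer_combine_motifs(
#       ["homer/comparisonA", "homer/comparisonB"], "homer/combined",
#       run=False, genome="hg38")
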
def enrichr(dataframe, gene_set_libraries=None, kind="genes", max_attempts=5):
    """
    Use Enrichr on a list of genes (currently only genes supported through the API).

    Parameters
    ----------
    dataframe : :obj:`pandas.DataFrame`
        DataFrame with column "gene_name".
    gene_set_libraries : :obj:`list`, optional
        Gene set libraries to use.
        Defaults to values in initial configuration file.
        To see them, do: ``ngs_toolkit._CONFIG['resources']['enrichr']['gene_set_libraries']``
    kind : :obj:`str`, optional
        Type of input. Right now, only "genes" is supported.
        Defaults to "genes".
    max_attempts : :obj:`int`, optional
        Number of times to try a call to Enrichr API.
        Defaults to 5.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Results of enrichment analysis.

    Raises
    ------
    Exception
        If `max_attempts` is exceeded.
    """
    import json

    import requests
    from tqdm import tqdm

    ENRICHR_ADD = "http://amp.pharm.mssm.edu/Enrichr/addList"
    ENRICHR_RETRIEVE = "http://amp.pharm.mssm.edu/Enrichr/enrich"
    query_string = "?userListId={}&backgroundType={}"

    if gene_set_libraries is None:
        # Get gene set libraries from config
        _LOGGER.debug("Getting Enrichr gene set libraries from configuration.")
        msg = "Enrichr gene set libraries value in configuration could not be found or understood. "
        msg += "Please add a list of value(s) to this section 'resources:enrichr:gene_set_libraries'. "
        msg += "For an example, see https://github.com/afrendeiro/toolkit/tree/master/ngs_toolkit/config/example.yaml"
        try:
            gene_set_libraries = _CONFIG["resources"]["enrichr"]["gene_set_libraries"]
        except KeyError:
            _LOGGER.error(msg)
            return
        if not isinstance(gene_set_libraries, list):
            if isinstance(gene_set_libraries, str):
                # wrap a single library name in a list (list() on a string
                # would split it into characters)
                gene_set_libraries = [gene_set_libraries]
            else:
                _LOGGER.error(msg)
                return
        if len(gene_set_libraries) < 1:
            _LOGGER.error(msg)
            return

    if kind == "genes":
        # Build payload with gene list
        attr = "\n".join(dataframe["gene_name"].dropna().tolist())
    elif kind == "regions":
        # Build payload with BED-like regions
        attr = "\n".join(
            dataframe[["chrom", "start", "end"]]
            .apply(lambda x: "\t".join([str(i) for i in x]), axis=1)
            .tolist()
        )
    payload = {"list": (None, attr), "description": (None, "")}

    # Request adding gene set
    response = requests.post(ENRICHR_ADD, files=payload)
    if not response.ok:
        raise Exception("Error analyzing gene list")

    # Track gene set ID
    user_list_id = json.loads(response.text)["userListId"]

    results = pd.DataFrame()
    for gene_set_library in tqdm(
        gene_set_libraries, total=len(gene_set_libraries), desc="Gene set library"
    ):
        _LOGGER.debug("Using Enrichr on {} gene set library.".format(gene_set_library))

        # Request enriched sets in gene set
        i = 0
        okay = False
        while not okay:
            if i == max_attempts:
                _LOGGER.error(
                    "API request failed {} times, exceeding `max_attempts`.".format(i)
                )
                raise Exception("Fetching enrichment results maxed `max_attempts`.")
            response = requests.get(
                ENRICHR_RETRIEVE + query_string.format(user_list_id, gene_set_library)
            )
            okay = response.ok
            if not okay:
                _LOGGER.warning("API request failed. Retrying.")
                # count the failed attempt so `max_attempts` can be reached
                i += 1

        # Get enriched sets in gene set
        res = json.loads(response.text)
        # If there's no enrichment, continue
        if len(res) == 0:
            continue

        # Put in dataframe
        res = pd.DataFrame([pd.Series(s) for s in res[gene_set_library]])
        if res.shape[0] == 0:
            continue
        cols = [
            "rank",
            "description",
            "p_value",
            "z_score",
            "combined_score",
            "genes",
            "adjusted_p_value",
        ]
        if len(res.columns) == 7:
            res.columns = cols
        elif len(res.columns) == 9:
            res.columns = cols + ["old_p_value", "old_adjusted_p_value"]

        # Remember gene set library used
        res["gene_set_library"] = gene_set_library

        # Append to master dataframe
        results = results.append(res, ignore_index=True)

    return results

def run_enrichment_jobs(
    results_dir,
    genome,
    background_bed,
    steps=["lola", "meme", "homer", "enrichr"],
    overwrite=True,
    pep_config=None,
):
    """
    Submit parallel enrichment jobs for a specific analysis.

    Parameters
    ----------
    results_dir : :obj:`str`
        Directory with input files for the enrichment analyses.
    genome : :obj:`str`
        Genome assembly of the analysis.
    background_bed : :obj:`str`
        BED file to use as background for LOLA analysis.
        Typically the analysis' own consensus region set.
    steps : :obj:`list`, optional
        Steps of the analysis to perform.
        Defaults to ["lola", "meme", "homer", "enrichr"].
    overwrite : :obj:`bool`, optional
        Whether output should be overwritten.
        If not, no jobs will be submitted for jobs with existing output files.
        Defaults to True.
    pep_config : :obj:`str`, optional
        Pickle file of the analysis. Only required for the "region" step.
    """
    # TODO: replace hardcoded paths with info from resources
    # TODO: remove pep_config requirement to "region_enrichment"
    import sys
    from glob import glob

    from ngs_toolkit.utils import submit_job

    dbs = {
        "human": "~/resources/motifs/motif_databases/HUMAN/HOCOMOCOv10.meme",
        "mouse": "~/resources/motifs/motif_databases/MOUSE/uniprobe_mouse.meme",
    }
    omap = {"hg38": "human", "hg19": "human", "mm10": "mouse"}

    # list of tuples with: job_name, log, exec,
    # requirements (partition, cpu, mem, time), cmd
    jobs = list()

    # REGION
    if "region" in steps:
        files = glob(results_dir + "/*/*_regions.bed")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "region_type_enrichment.csv")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_region",
                    os.path.join(dir_, name + ".region.log"),
                    os.path.join(dir_, name + ".region.sh"),
                    ("shortq", 1, 8000, "08:00:00"),
                    "{} -m ngs_toolkit.recipes.region_enrichment --output-file {} {} {}".format(
                        sys.executable, output_, file, pep_config
                    ),
                ]
            )

    # LOLA
    if "lola" in steps:
        files = glob(results_dir + "/*/*_regions.bed")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "allEnrichments.tsv")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_lola",
                    os.path.join(dir_, name + ".lola.log"),
                    os.path.join(dir_, name + ".lola.sh"),
                    ("shortq", 2, 12000, "08:00:00"),
                    "{} -m ngs_toolkit.recipes.lola {} {} {} {} -c 2".format(
                        sys.executable, file, background_bed, dir_, genome
                    ),
                ]
            )

    # AME
    if "meme" in steps:
        files = glob(results_dir + "/*/*_regions.fa")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "ame.html")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_meme",
                    os.path.join(dir_, name + ".meme_ame.log"),
                    os.path.join(dir_, name + ".meme_ame.sh"),
                    ("shortq", 1, 4000, "08:00:00"),
                    "fasta-dinucleotide-shuffle -c 1 -f {f} > {f}.shuffled.fa\n".format(
                        f=file
                    )
                    + " ame --bgformat 1 --scoring avg --method ranksum --pvalue-report-threshold 0.05"
                    + " --control {f}.shuffled.fa -o {d} {f} {motifs}".format(
                        f=file, d=dir_, motifs=dbs[omap[genome]]
                    ),
                ]
            )

    # HOMER
    if "homer" in steps:
        files = glob(results_dir + "/*/*_regions.bed")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = os.path.join(dir_, "homerResults.html")
            if os.path.exists(output_) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_homer",
                    os.path.join(dir_, name + ".homer.log"),
                    os.path.join(dir_, name + ".homer.sh"),
                    ("shortq", 8, 12000, "08:00:00"),
                    "findMotifsGenome.pl {f} {genome}r {d} -size 1000 -h -p 2 -len 8,10,12,14 -noknown".format(
                        f=file, d=dir_, genome=genome
                    ),
                ]
            )

    # Enrichr
    if "enrichr" in steps:
        files = glob(results_dir + "/*/*.gene_symbols.txt")
        for file in files:
            dir_ = os.path.dirname(file)
            name = os.path.basename(dir_)
            output_ = file.replace(".gene_symbols.txt", ".enrichr.csv")
            if os.path.exists(get_this_file_or_timestamped(output_)) and (not overwrite):
                continue
            jobs.append(
                [
                    name + "_enrichr",
                    os.path.join(dir_, name + ".enrichr.log"),
                    os.path.join(dir_, name + ".enrichr.sh"),
                    ("shortq", 1, 4000, "08:00:00"),
                    "{e} -m ngs_toolkit.recipes.enrichr {f} {o}".format(
                        e=sys.executable, f=file, o=output_
                    ),
                ]
            )

    for jobname, log_file, job_file, (partition, cores, mem, time), task in jobs:
        submit_job(
            task,
            job_file,
            log_file=log_file,
            jobname=jobname,
            partition=partition,
            cores=cores,
            mem=mem,
            time=time)

def project_to_geo(
    project,
    output_dir="geo_submission",
    samples=None,
    distributed=False,
    dry_run=False,
    **kwargs
):
    """
    Prepare raw sequencing files for submission to GEO.
    Files will be copied or generated in a new directory ``output_dir``.
    It will get the raw BAM file(s) of each sample, and in case of
    ATAC-seq/ChIP-seq samples, the bigWig and peak files. If multiple BAM
    files exist for each sample, all will be copied and sequentially named
    with the "fileN" suffix, where "N" is the file number.

    For each copied file a md5sum will be calculated.

    A pandas DataFrame with info on the sample's files and md5sums will be returned.

    Parameters
    ----------
    project : :obj:`peppy.Project`
        A :class:`peppy.Project` object to process.
    output_dir : :obj:`str`, optional
        Directory to create output. Will be created/overwritten if existing.
        Defaults to "geo_submission".
    samples : :obj:`list`, optional
        List of :class:`peppy.Sample` objects in project to restrict to.
        Defaults to all samples in project.
    distributed : :obj:`bool`, optional
        Whether processing should be distributed as jobs in a computing cluster
        for each sample.
        Currently available implementation supports a SLURM cluster only.
        Defaults is :obj:`False`.
    dry_run : :obj:`bool`, optional
        Whether copy/execution/submission commands should be not be run, to test.
        Default is :obj:`False`.
    **kwargs : :obj:`dict`
        Additional keyword arguments will be passed to
        :meth:`ngs_toolkit.utils.submit_job` if ``distributed`` is :obj:`True`,
        and on to a :class:`divvy` submission template.
        Pass for example:

        * computing_configuration="slurm"
        * jobname="job"
        * cores=2
        * mem=8000
        * partition="longq"

    Returns
    -------
    :class:`pandas.DataFrame`
        Annotation of samples and their BAM, BigWig, narrowPeak files and
        respective md5sums.
    """
    from ngs_toolkit.utils import submit_job

    output_dir = os.path.abspath(output_dir)
    if samples is None:
        samples = project.samples
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    annot = pd.DataFrame(index=pd.Index([], name="sample_name"))
    for sample in samples:
        various = len(sample.data_path.split(" ")) > 1
        cmd = ""
        for i, file in enumerate(sample.data_path.split(" ")):
            suffix = ".file{}".format(i) if various else ""
            # Copy raw file
            bam_file = os.path.join(output_dir, sample.name + "{}.bam".format(suffix))
            cmd += "cp {} {};\n".format(file, bam_file)
            cmd += "chmod 644 {};\n".format(bam_file)
            annot.loc[sample.name, "bam_file{}".format(i)] = bam_file

            # Copy or generate md5sum
            md5_file = bam_file + ".md5"
            if os.path.exists(file + ".md5"):
                cmd += "cp {} {};\n".format(file + ".md5", md5_file)
            else:
                b = os.path.basename(file)
                cmd += "md5sum {} > {};\n".format(os.path.join(output_dir, b), md5_file)
            cmd += "chmod 644 {};\n".format(md5_file)
            annot.loc[sample.name, "bam_file{}_md5sum".format(i)] = md5_file

        # Copy bigWig files
        if sample.library in ["ATAC-seq", "ChIP-seq"]:
            if hasattr(sample, "bigwig"):
                bigwig_file = os.path.join(output_dir, sample.name + ".bigWig")
                cmd += "cp {} {};\n".format(sample.bigwig, bigwig_file)
                cmd += "chmod 644 {};\n".format(bigwig_file)
                annot.loc[sample.name, "bigwig_file"] = bigwig_file

                # Copy or generate md5sum
                md5_file = bigwig_file + ".md5"
                if os.path.exists(sample.bigwig + ".md5"):
                    cmd += "cp {} {};\n".format(sample.bigwig + ".md5", md5_file)
                else:
                    b = os.path.basename(sample.bigwig)
                    cmd += "md5sum {} > {};\n".format(
                        os.path.join(output_dir, b), md5_file
                    )
                cmd += "chmod 644 {};\n".format(md5_file)
                annot.loc[sample.name, "bigwig_file_md5sum"] = md5_file
            else:
                _LOGGER.warning(
                    "'{}' sample '{}' does not have a 'bigwig'".format(
                        sample.library, sample.name
                    )
                    + " attribute set. Skipping bigWig file."
                )

        # Copy peaks
        if sample.library == "ATAC-seq":
            if hasattr(sample, "peaks"):
                peaks_file = os.path.join(output_dir, sample.name + ".peaks.narrowPeak")
                cmd += "cp {} {};\n".format(sample.peaks, peaks_file)
                cmd += "chmod 644 {};\n".format(peaks_file)
                annot.loc[sample.name, "peaks_file"] = peaks_file

                # Copy or generate md5sum
                md5_file = peaks_file + ".md5"
                if os.path.exists(sample.peaks + ".md5"):
                    cmd += "cp {} {};\n".format(sample.peaks + ".md5", md5_file)
                else:
                    cmd += "md5sum {} > {};\n".format(peaks_file, md5_file)
                cmd += "chmod 644 {};\n".format(md5_file)
                annot.loc[sample.name, "peaks_file_md5sum"] = md5_file
            else:
                _LOGGER.warning(
                    "'{}' sample '{}' does not have a 'peaks' attribute set.".format(
                        sample.library, sample.name
                    )
                    + " Skipping peaks file."
                )

        cmd += "\ndate\n"

        # Assemble job
        job_name = "project_to_geo.{}".format(sample.name)
        log_file = os.path.join(output_dir, job_name + ".log")
        job_file = os.path.join(output_dir, job_name + ".sh")
        submit_job(
            cmd,
            job_file,
            log_file=log_file,
            jobname=job_name,
            cores=1,
            mem=8000,
            dry_run=dry_run,
            **kwargs)

    return annot

[docs]def rename_sample_files(
    annotation_mapping,
    old_sample_name_column="old_sample_name",
    new_sample_name_column="new_sample_name",
    tmp_prefix="rename_sample_files",
    results_dir="results_pipeline",
    dry_run=False,
):
    """
    Rename existing directories with pipeline outputs for samples
    based on a mapping of old/new sample names.

    All files within the directory with the old sample name will be renamed recursively.
    Old and new sample names can overlap - this procedure will handle these cases correctly
    by a 2-step process using temporary sample names with prefix ``tmp_prefix``.

    .. caution::
        NEEDS TESTING!
        This function has not been used in a while and needs more testing.

    Parameters
    ----------
    annotation_mapping : :obj:`pandas.DataFrame`
        DataFrame with mapping of old (column ``old_sample_name_column``)
        vs new (column ``new_sample_name_column``) sample names.

    old_sample_name_column : :obj:`str`, optional
        Name of column with old sample names.

        Defaults to "old_sample_name".

    new_sample_name_column : :obj:`str`, optional
        Name of column with new sample names.

        Defaults to "new_sample_name".

    tmp_prefix : :obj:`str`, optional
        Prefix for temporary files to avoid overlap between old and new names.

        Defaults to "rename_sample_files".

    results_dir : :obj:`str`, optional
        Pipeline output directory containing sample output directories.

        Defaults to "results_pipeline".

    dry_run: :obj:`bool`, optional
        Whether to print commands instead of running them.

        Defaults to :obj:`False`.
    """
    import subprocess

    # TODO: test
    cmds = list()
    # 1) move to tmp name
    for i, series in annotation_mapping.iterrows():
        o = series[old_sample_name_column]
        t = "{}-{}".format(tmp_prefix, i)
        cmds.append("# Moving old sample '{}' to '{}'.".format(o, t))
        # directory
        cmds.append(
            "mv {} {}".format(
                os.path.join(results_dir, o), os.path.join(results_dir, t)
            )
        )
        # further directories
        cmds.append(
            "find {} -type d -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, t), o, t
            )
        )
        # files
        cmds.append(
            "find {} -type f -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, t), o, t
            )
        )

    # 2) move to new name
    for i, series in annotation_mapping.iterrows():
        t = "{}-{}".format(tmp_prefix, i)
        n = series[new_sample_name_column]
        cmds.append("# Moving temporary sample '{}' to '{}'.".format(t, n))
        # directory
        cmds.append(
            "mv {} {}".format(
                os.path.join(results_dir, t), os.path.join(results_dir, n)
            )
        )
        # further directories
        cmds.append(
            "find {} -type d -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, n), t, n
            )
        )
        # files
        cmds.append(
            "find {} -type f -exec rename {} {} {{}} \\;".format(
                os.path.join(results_dir, n), t, n
            )
        )

    if dry_run:
        _LOGGER.info("\n".join(cmds))
    else:
        for i, cmd in enumerate(cmds):
            _LOGGER.info("%i: %s", i, cmd)
            if cmd.startswith("#"):
                continue
            r = subprocess.call(cmd.split(" "))
            if r != 0:
                raise OSError("Command '{}' failed.".format(cmd))
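# Hypothetical usage sketch (not part of the original module): note that the
# new name of the first sample overlaps with the old name of the second, which
# the two-step temporary renaming above handles safely.
def _example_rename_sample_files():
    mapping = pd.DataFrame(
        {
            "old_sample_name": ["sampleA", "sampleB"],
            "new_sample_name": ["sampleB", "sampleC"],
        }
    )
    # dry_run=True logs the shell commands instead of executing them
    rename_sample_files(mapping, results_dir="results_pipeline", dry_run=True)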
[docs]def query_biomart(
    attributes=None, species="hsapiens", ensembl_version="grch38", max_api_retries=5
):
    """
    Query Biomart (https://www.ensembl.org/biomart/martview/) for gene attributes.

    Returns a pandas dataframe with the query results.
    If a certain field contains commas, it will attempt to return a
    dataframe anyway, but parsing might fail.

    Parameters
    ----------
    attributes : :obj:`list`, optional
        List of Ensembl attributes to query.

        Defaults to ["ensembl_gene_id", "external_gene_name", "hgnc_id", "hgnc_symbol"].

    species : :obj:`str`, optional
        Ensembl string of species to query. Must be vertebrate.

        Defaults to "hsapiens".

    ensembl_version : :obj:`str`, optional
        Ensembl version to query. Currently "grch37", "grch38" and "grcm38" are tested.

        Defaults to "grch38".

    max_api_retries : :obj:`int`, optional
        How many times to retry a failed API request.

        Defaults to 5.

    Returns
    -------
    :obj:`pandas.DataFrame`
        Dataframe with required attributes for each entry.
    """
    import time

    import requests
    import urllib3

    supported = ["grch37", "grch38", "grcm38"]
    if ensembl_version not in supported:
        msg = "Ensembl version might not be supported."
        msg += " Tested versions are '{}'.".format("','".join(supported))
        msg += " Will try anyway."
        _LOGGER.warning(msg)

    if attributes is None:
        attributes = ["ensembl_gene_id", "external_gene_name", "hgnc_id", "hgnc_symbol"]

    # Build request XML
    # "38"-series assemblies live on the main Ensembl host;
    # older ones are on dedicated subdomains (e.g. grch37.ensembl.org)
    ens_ver = "" if ensembl_version.endswith("38") else ensembl_version + "."
    url_query = "".join(
        [
            """http://{}ensembl.org/biomart/martservice?query=""".format(ens_ver),
            """<?xml version="1.0" encoding="UTF-8"?>""",
            """<!DOCTYPE Query>""",
            """<Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6" >""",
            """<Dataset name="{}_gene_ensembl" interface="default" >""".format(species),
        ]
        + ["""<Attribute name="{}" />""".format(attr) for attr in attributes]
        + ["""</Dataset>""", """</Query>"""]
    )
    n_fails = 0
    success = False
    while not success:
        req = requests.get(url_query, stream=True)
        if not req.ok:
            n_fails += 1
            msg = "Request to Biomart API was not successful."
            if n_fails == max_api_retries:
                _LOGGER.error(msg)
                raise ValueError(msg)
            else:
                _LOGGER.warning(msg + " Retrying.")
                time.sleep(1)
        else:
            try:
                content = list(req.iter_lines())
                success = True
            except (
                requests.exceptions.ChunkedEncodingError,
                urllib3.exceptions.ProtocolError,
            ):
                msg = "Retrieving Biomart request failed."
                n_fails += 1
                if n_fails == max_api_retries:
                    _LOGGER.error(msg)
                    raise ValueError(msg)
                else:
                    _LOGGER.warning(msg + " Retrying.")
                    time.sleep(1)

    # `iter_lines` yields bytes; decode before inspecting the content
    if isinstance(content[0], bytes):
        content = [x.decode("utf-8") for x in content]

    if (len(content) == 1) and (content[0].startswith("Query ERROR")):
        msg = "Request to Biomart API was not successful. Check your input.\n{}".format(
            content[0]
        )
        _LOGGER.error(msg)
        raise ValueError(msg)

    try:
        mapping = pd.DataFrame(
            [x.strip().split(",") for x in content], columns=attributes
        )
    except AssertionError as e:
        msg = "Could not simply return a dataframe with results."
        msg += " It is likely because one of the requested attributes contains commas and is quoted."
        msg += " Or because an organism not present in the vertebrate database was queried."
        msg += " Will try to replace these fields and parse again."
        _LOGGER.warning(msg)
        # input probably contains commas inside fields
        c = pd.Series([x.strip() for x in content])
        # we'll assume that fields with commas have been quoted;
        # get text inside double quotes
        cc = c.str.extract('"(.*)"')
        if cc.shape[1] > 1:
            _LOGGER.error("Attempt to fix failed.")
            raise e
        cc = cc.squeeze().str.replace(",", "")
        # now get text until first quote and concatenate with clean text
        f = c.str.extract('(.*),"').fillna("").squeeze() + "," + cc
        # now get back together with instances that didn't have quotes
        c.update(f)
        try:
            mapping = pd.DataFrame(
                [x.strip().split(",") for x in c], columns=attributes
            )
        except AssertionError:
            _LOGGER.error("Attempt to fix failed.")
            raise e
        _LOGGER.info("Attempt to fix successful.")

    return mapping.replace("", np.nan)
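# Hypothetical usage sketch (not part of the original module): map mouse
# Ensembl gene identifiers to gene names on the GRCm38 mart.
def _example_query_biomart():
    res = query_biomart(
        attributes=["ensembl_gene_id", "external_gene_name"],
        species="mmusculus",
        ensembl_version="grcm38",
    )
    print(res.head())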
[docs]def subtract_principal_component(
    x,
    pc=1,
    norm=False,
    plot=True,
    plot_name="PCA_based_batch_correction.svg",
    max_pcs_to_plot=6,
):
    """
    Given a matrix `x` (n_samples, n_variables), remove principal component
    `pc` (1-based) from the matrix, optionally plotting the samples in PCA
    space before and after removal.
    """
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    from ngs_toolkit.graphics import savefig

    pc -= 1

    # All regions
    if norm:
        # keep the DataFrame structure (index/columns) after scaling
        x = pd.DataFrame(
            StandardScaler().fit_transform(x), index=x.index, columns=x.columns
        )

    # PCA
    pca = PCA()
    x_hat = pca.fit_transform(x)

    # Remove PC
    x2 = x - np.outer(x_hat[:, pc], pca.components_[pc, :])

    max_pcs_to_plot = min(max_pcs_to_plot, x2.shape[0]) - 1
    # plot
    if plot:
        x2_hat = pca.fit_transform(x2)
        fig, axis = plt.subplots(
            max_pcs_to_plot, 2, figsize=(4 * 2, 4 * max_pcs_to_plot), squeeze=False
        )
        for i in range(max_pcs_to_plot):
            # before
            for j, _ in enumerate(x.index):
                axis[i, 0].scatter(x_hat[j, i], x_hat[j, i + 1], s=50, rasterized=True)
            axis[i, 0].set_xlabel("PC{}".format(i + 1))
            axis[i, 0].set_ylabel("PC{}".format(i + 2))
            # after
            for j, _ in enumerate(x2.index):
                axis[i, 1].scatter(
                    x2_hat[j, i], x2_hat[j, i + 1], s=35, alpha=0.8, rasterized=True
                )
            axis[i, 1].set_xlabel("PC{}".format(i + 1))
            axis[i, 1].set_ylabel("PC{}".format(i + 2))
        savefig(fig, plot_name)

    return x2
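# Hypothetical usage sketch (not part of the original module) with random data:
def _example_subtract_principal_component():
    x = pd.DataFrame(
        np.random.normal(size=(10, 100)),
        index=["sample{}".format(i) for i in range(10)],
    )
    # remove the first principal component, skipping the diagnostic plot
    x_fixed = subtract_principal_component(x, pc=1, plot=False)
    assert x_fixed.shape == x.shape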
[docs]def subtract_principal_component_by_attribute(df, attributes, pc=1):
    """
    Given a matrix `df` (n_samples, n_variables), remove principal component
    `pc` (1-based) from the matrix, fit separately for each group of samples
    whose index matches one of `attributes`.
    """
    from sklearn.decomposition import PCA

    pc -= 1

    x2 = pd.DataFrame(index=df.index, columns=df.columns)
    for attr in attributes:
        _LOGGER.info(attr)
        sel = df.index[df.index.str.contains(attr)]
        x = df.loc[sel, :]

        # PCA
        pca = PCA()
        x_hat = pca.fit_transform(x)

        # Remove PC
        x2.loc[sel, :] = x - np.outer(x_hat[:, pc], pca.components_[pc, :])

    # samples not matched by any attribute are kept unchanged
    for sample in df.index:
        if x2.loc[sample, :].isnull().all():
            x2.loc[sample, :] = df.loc[sample, :]
    return x2
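# Hypothetical usage sketch (not part of the original module): the batch is
# assumed to be encoded in the sample names, so each `attributes` entry selects
# one group of samples by substring matching on the index.
def _example_subtract_principal_component_by_attribute():
    idx = ["batchA_s{}".format(i) for i in range(5)] + [
        "batchB_s{}".format(i) for i in range(5)
    ]
    df = pd.DataFrame(np.random.normal(size=(10, 50)), index=idx)
    fixed = subtract_principal_component_by_attribute(
        df, attributes=["batchA", "batchB"], pc=1
    )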
[docs]def fix_batch_effect_limma(matrix, batch_variable="batch", covariates=None):
    """
    Fix a batch effect in a matrix using limma.

    Requires the R package "limma" to be installed:

    .. highlight:: R
    .. code-block:: R

        if (!requireNamespace("BiocManager", quietly = TRUE))
            install.packages("BiocManager")
        BiocManager::install("limma")

    Parameters
    ----------
    matrix : :obj:`pandas.DataFrame`
        DataFrame with a MultiIndex on the columns for potential covariate annotations.

    batch_variable : :obj:`str`, optional
        Name of the column MultiIndex level with the batch annotation to regress out.

        Defaults to "batch".

    covariates : :obj:`list`, optional
        Names of column MultiIndex levels with covariates to preserve in the design.

        Defaults to :obj:`None` (no covariates).

    Returns
    -------
    :obj:`pandas.DataFrame`
        Matrix with the batch effect regressed out.
    """
    import patsy
    from rpy2.robjects import numpy2ri, pandas2ri, r
    from rpy2.robjects.packages import importr

    numpy2ri.activate()
    pandas2ri.activate()

    importr("limma")

    if covariates is None:
        covariates = []

    if len(covariates) > 0:
        # design matrix of covariates whose effects should be preserved
        cov = patsy.dmatrix(
            "~{} - 1".format(" + ".join(covariates)), matrix.columns.to_frame()
        )
        fixed = r("removeBatchEffect")(
            x=matrix.values,
            batch=matrix.columns.get_level_values(batch_variable),
            design=cov,
        )
    else:
        fixed = r("removeBatchEffect")(
            x=matrix.values, batch=matrix.columns.get_level_values(batch_variable)
        )
    fixed = pd.DataFrame(np.asarray(fixed), index=matrix.index, columns=matrix.columns)
    return fixed
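# Hypothetical usage sketch (not part of the original module): requires a
# working rpy2 setup and the R package "limma"; sample and batch labels are
# placeholders carried in a column MultiIndex.
def _example_fix_batch_effect_limma():
    cols = pd.MultiIndex.from_tuples(
        [("s1", "b1"), ("s2", "b1"), ("s3", "b2"), ("s4", "b2")],
        names=["sample_name", "batch"],
    )
    matrix = pd.DataFrame(np.random.normal(size=(100, 4)), columns=cols)
    fixed = fix_batch_effect_limma(matrix, batch_variable="batch")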