Source code for ngs_toolkit.chipseq

#!/usr/bin/env python


import os

import numpy as np
import pandas as pd

from ngs_toolkit import _LOGGER
from ngs_toolkit.atacseq import ATACSeqAnalysis


[docs]class ChIPSeqAnalysis(ATACSeqAnalysis): """ Class to model analysis of ChIP-seq data. Inherits from the :class:`~ngs_toolkit.atacseq.ATACSeqAnalysis` class. Parameters ---------- name : :obj:`str`, optional Name of the analysis. Defaults to "analysis". from_pep : :obj:`str`, optional PEP configuration file to initialize analysis from. The analysis will adopt as much attributes from the PEP as possible but keyword arguments passed at initialization will still have priority. Defaults to :obj:`None` (no PEP used). from_pickle : :obj:`str`, optional Pickle file of an existing serialized analysis object from which the analysis should be loaded. Defaults to :obj:`None` (will not load from pickle). root_dir : :obj:`str`, optional Base directory for the project. Defaults to current directory or to what is specified in PEP if :attr:`~ngs_toolkit.analysis.Analysis.from_pep`. data_dir : :obj:`str`, optional Directory containing processed data (e.g. by looper) that will be input to the analysis. This is in principle not required. Defaults to "data". results_dir : :obj:`str`, optional Directory to contain outputs produced by the analysis. Defaults to "results". prj : :class:`peppy.Project`, optional A :class:`peppy.Project` object that this analysis is tied to. Defaults to :obj:`None`. samples : :obj:`list`, optional List of :class:`peppy.Sample` objects that this analysis is tied to. Defaults to :obj:`None`. kwargs : :obj:`dict`, optional Additional keyword arguments will be passed to parent class :class:`~ngs_toolkit.atacseq.ATACSeqAnalysis`. """ _data_type = "ChIP-seq" def __init__( self, name=None, from_pep=False, from_pickle=False, root_dir=None, data_dir="data", results_dir="results", prj=None, samples=None, **kwargs ): # The check for existance is to make sure other classes can inherit from this default_args = { "data_type": "ChIP-seq", "__data_type__": "ChIP-seq", "var_unit_name": "region", "quantity": "binding", "norm_units": "RPM"} for k, v in default_args.items(): if not hasattr(self, k): setattr(self, k, v) super(ChIPSeqAnalysis, self).__init__( name=name, from_pep=from_pep, from_pickle=from_pickle, root_dir=root_dir, data_dir=data_dir, results_dir=results_dir, prj=prj, samples=samples, **kwargs )
[docs] def call_peaks_from_comparisons( self, comparison_table=None, output_dir="{results_dir}/chipseq_peaks", permissive=True, overwrite=True, distributed=True, ): """ Call peaks for ChIP-seq samples using an annotation of which samples belong in each comparison and which samples represent signal or background. Parameters ---------- comparison_table : :obj:`pandas.DataFrame` Comparison table with the following required columns: "comparison_name", "sample_name", "comparison_side", "sample_group". Defaults to analysis' own `comparison_table`. output_dir : :obj:`str` Parent directory where peaks will be created. Will be created if does not exist. permissive: :obj:`bool` If incomplete/incoherent comparisons should be skipped or an error should be thrown. Default is :obj:`True`. overwrite: :obj:`bool` If incomplete/incoherent comparisons should be skipped or an error should be thrown. Default is :obj:`True`. distributed: :obj:`bool` Whether peak calling should be run in serial or in distributed mode as jobs. Default is :obj:`True`. Raises ---------- ValueError If not `permissive` and incomplete/incoherent comparisons are detected. """ import subprocess from ngs_toolkit.utils import ( macs2_call_chipseq_peak, homer_call_chipseq_peak_job, ) from tqdm import tqdm if comparison_table is None: comparison_table = self.comparison_table req_columns = [ "comparison_name", "sample_name", "comparison_side", "sample_group", ] msg = "Comparison table is missing some of the following columns: '{}'.".format( ",".join(req_columns) ) if not all([col in comparison_table.columns for col in req_columns]): _LOGGER.error(msg) raise AssertionError(msg) # Complement default `output_dir` if "{results_dir}" in output_dir: output_dir = os.path.abspath( output_dir.format(results_dir=self.results_dir) ) if not os.path.exists(output_dir): os.makedirs(output_dir) # For each comparison comps = comparison_table["comparison_name"].drop_duplicates().sort_values() for comparison in tqdm(comps, total=len(comps), desc="Comparison"): # If there aren't two sides to each comparison, skip it or throw error if ( len( set( comparison_table[ (comparison_table["comparison_name"] == comparison) ]["comparison_side"].tolist() ) ) != 2 ): error = "Comparison '{}' does not contain two sides.".format(comparison) if permissive: _LOGGER.warning(error) continue else: _LOGGER.error(error) raise ValueError(error) # Get the sample names of samples in each side pos_names = comparison_table[ (comparison_table["comparison_name"] == comparison) & (comparison_table["comparison_side"] == 1) ]["sample_name"].tolist() neg_names = comparison_table[ (comparison_table["comparison_name"] == comparison) & (comparison_table["comparison_side"] < 1) ]["sample_name"].tolist() # Now get the actual samples signal_samples = [s for s in self.samples if s.name in pos_names] control_samples = [s for s in self.samples if s.name in neg_names] if (len(signal_samples) == 0) or (len(control_samples) == 0): error = "Comparison side for '{}' comparison does not contain samples.".format( comparison ) if permissive: print(error) continue else: raise ValueError(error) print( "Doing comparison '{}' with positive samples '{}' and background samples '{}'".format( comparison, [s.name for s in signal_samples], [s.name for s in control_samples], ) ) # Call peaks cmds = list() kwargs = { "signal_samples": signal_samples, "control_samples": control_samples, "output_dir": output_dir, "name": comparison, "distributed": distributed} if overwrite: cmds += [ macs2_call_chipseq_peak(**kwargs), homer_call_chipseq_peak_job(**kwargs) ] else: if not os.path.exists( os.path.join( output_dir, comparison, comparison + "_peaks.narrowPeak" ) ): cmds += [ macs2_call_chipseq_peak(**kwargs) ] if not os.path.exists( os.path.join( output_dir, comparison, comparison + "_homer_peaks.narrowPeak" ) ): cmds += [ homer_call_chipseq_peak_job(**kwargs) ] else: _LOGGER.warning( "Peak files for comparison '{}' already exist. Skipping.".format( comparison ) ) if not distributed: for cmd in cmds: _LOGGER.info( "Calling peaks for comparison '{}' with command: '{}'.\n".format( comparison, cmd ) ) subprocess.call(cmd.split(" "))
[docs] def filter_peaks( self, comparison_table=None, filter_bed=None, peaks_dir="{results_dir}/chipseq_peaks", ): """ Filter peak calls for various comparisons for entries that do not overlap another BED file. Parameters ---------- comparison_table : :obj:`pandas.DataFrame`, optional Comparison table with the following required columns: "comparison_name", "sample_name", "comparison_side", "sample_group". Defaults to analysis' own `comparison_table`. filter_bed : :obj:`str` BED file with entries to filter out from the BED files of each comparison. Defaults to the set of Blacklisted regions from the analysis' genome. In that case it will be fetched if not present. peaks_dir : :obj:`str` Parent directory where peak calls for each comparison exist. Will be created if does not exist. Defaults to "{results_dir}/chipseq_peaks". Raises ---------- AttributeError If `filter_bed` is not given and failes to be retrieved. """ import pybedtools from ngs_toolkit.utils import homer_peaks_to_bed def _filter(input_bed, filter_bed, output_bed): """ Filter BED file for entries that overlap another BED file. Parameters ---------- input_bed : :obj:`str` BED file to filter. filter_bed : :obj:`str` BED file with entries to filter from input_bed. output_bed : :obj:`str` Output BED file. """ ( pybedtools.BedTool(input_bed) .intersect(pybedtools.BedTool(filter_bed), v=True) .saveas(output_bed) ) if comparison_table is None: comparison_table = self.comparison_table if filter_bed is None: _LOGGER.info("Blacklist file not provided. Downloading...") try: filter_bed = self.get_resources(steps=["blacklist"])[ "blacklist_file" ] except AttributeError: msg = "Blacklist file was not provided and cannot be" msg += " get one without analysis having `organism` and `genome` set." _LOGGER.error(msg) raise AttributeError(msg) peaks_dir = self._format_string_with_attributes(peaks_dir) if not os.path.exists(peaks_dir): os.makedirs(peaks_dir) for comparison in comparison_table["comparison_name"].unique(): # MACS2 prefix = os.path.join(peaks_dir, comparison, comparison + "_peaks") _filter(prefix + ".narrowPeak", filter_bed, prefix + ".filtered.bed") # HOMER prefix = os.path.join(peaks_dir, comparison, comparison + "_homer_peaks") homer_peaks_to_bed(prefix + ".factor.narrowPeak", prefix + ".factor.bed") _filter(prefix + ".factor.bed", filter_bed, prefix + ".factor.filtered.bed") homer_peaks_to_bed(prefix + ".histone.narrowPeak", prefix + ".histone.bed") _filter( prefix + ".histone.bed", filter_bed, prefix + ".histone.filtered.bed" )
[docs] def summarize_peaks_from_comparisons( self, comparison_table=None, output_dir="{results_dir}/chipseq_peaks", filtered=True, permissive=True, ): """ Call peaks for ChIP-seq samples using an annotation of which samples belong in each comparison and which samples represent signal or background. Parameters ---------- comparison_table : :obj:`pandas.DataFrame`, optional Comparison table with the following required columns: "comparison_name", "sample_name", "comparison_side", "sample_group". Defaults to analysis' own `comparison_table`. output_dir : :obj:`str` Parent directory where peaks will be created. Will be created if does not exist. permissive: :obj:`bool` If incomplete/incoherent comparisons should be skipped or an error should be thrown. Raises ---------- ValueError Will be raised if not `permissive` and incomplete/incoherent comparisons are detected. """ from ngs_toolkit.utils import homer_peaks_to_bed if comparison_table is None: comparison_table = self.comparison_table req_columns = [ "comparison_name", "sample_name", "comparison_side", "sample_group", ] msg = "Comparison table is missing some of the following columns: '{}'.".format( ",".join(req_columns) ) if not all([col in comparison_table.columns for col in req_columns]): _LOGGER.error(msg) raise AssertionError(msg) # Complement default `output_dir` if "{results_dir}" in output_dir: output_dir = os.path.abspath( output_dir.format(results_dir=self.results_dir) ) # For each comparison, count called peaks peak_counts = pd.DataFrame() for comparison in ( comparison_table["comparison_name"].drop_duplicates().sort_values() ): _LOGGER.info(comparison) ending = ".filtered.bed" if filtered else ".narrowPeak" for peak_type, file in [ ( "macs", os.path.join( output_dir, comparison, comparison + "_peaks" + ending ), ), ( "homer_factor", os.path.join( output_dir, comparison, comparison + "_homer_peaks.factor" + ending, ), ), ( "homer_histone", os.path.join( output_dir, comparison, comparison + "_homer_peaks.histone" + ending, ), ), ]: error = "Peak files for comparison '{}' with '{}' parameters don't exist.".format( comparison, peak_type ) if "homer" in peak_type and not filtered: try: homer_peaks_to_bed(file, file.replace("narrowPeak", "bed")) except IOError: if permissive: _LOGGER.warning(error) peak_counts = peak_counts.append( pd.Series([comparison, peak_type, np.nan]), ignore_index=True, ) continue else: raise except pd.errors.EmptyDataError: peak_counts = peak_counts.append( pd.Series([comparison, peak_type, 0.0]), ignore_index=True ) file = file.replace("narrowPeak", "bed") try: df = pd.read_csv(file, sep="\t") except IOError: if permissive: _LOGGER.warning(error) peak_counts = peak_counts.append( pd.Series([comparison, peak_type, np.nan]), ignore_index=True, ) continue else: raise except pd.errors.EmptyDataError: peak_counts = peak_counts.append( pd.Series([comparison, peak_type, 0.0]), ignore_index=True ) peak_counts = peak_counts.append( pd.Series([comparison, peak_type, df.shape[0]]), ignore_index=True ) peak_counts.columns = ["comparison_name", "peak_type", "peak_counts"] return peak_counts # .fillna(0)
[docs] def get_consensus_sites( self, samples=None, region_type="summits", extension=250, blacklist_bed=None, filter_chroms=True, permissive=False, save=True, assign=True, **kwargs): """ Get consensus (union) of enriched sites (peaks) across all comparisons. There are two modes possible, defined by the value of ``region_type``: * peaks: simple union of all sites; * summits: peak summits are extended by ``extension`` and a union is made. For ChIP-seq, the ``comparison_table`` keyword argument or a ``comparison_table`` attribute set is required. Peaks/summits will be aggregated for the peaks called in each sample comparison. Parameters ---------- samples : :obj:`list` Iterable of :class:`peppy.Sample` objects to restrict to. Must have a ``peaks`` attribute set. Defaults to all samples in the analysis (``samples`` attribute). region_type : :obj:`str` The type of region to use to create the consensus region set - one of "summits" or "peaks". If "summits", peak summits will be extended by ``extension`` before union. If "peaks", sample peaks will be used with no modification prior to union. Default is "summits". extension : :obj:`int` Amount to extend peaks summits by in both directions. Default is 250. blacklist_bed : {:obj:`False`, :obj:`str`} Either :obj:`False` or a path to a BED file with genomic positions to exclude from consensus peak set. Default is to use a blacklist file for the analysis ``genome``. filter_chroms : {:obj:`list`, :obj:`str`} A list of chromosomes to filter out or a string with a pattern to match to exclude chromosomes. Uses Pandas string methods :class:`pandas.Series.str.match`. Pass for example `'.*_.*|chrM'` to filter out chromosomes with a "_" character and a "chrM" chromosome. Default is not to filter anything. permissive : :obj:`bool` Whether Samples that which ``region_type`` attribute file does not exist should be simply skipped or an error thrown. comparison_table : :obj:`pandas.DataFrame`, optional DataFrame with signal/background combinations used to call peaks. Part of kwargs. Defaults to analysis own ``comparison_table``. peak_dir : :obj:`str`, optional Path to peaks output directory. Part of kwargs. Defaults to "{analysis.results_dir}/chipseq_peaks". Attributes ---------- sites : :class:`pybedtools.BedTool` Bedtool with consensus sites. """ import re from ngs_toolkit.general import get_blacklist_annotations import pybedtools from tqdm import tqdm import tempfile if "comparison_table" not in kwargs: comparison_table = self.comparison_table else: comparison_table = kwargs["comparison_table"] if "peak_dir" not in kwargs: peak_dir = os.path.join(self.results_dir, "chipseq_peaks") if region_type not in ["summits", "peaks"]: msg = "`region_type` attribute must be one of 'summits' or 'peaks'!" _LOGGER.error(msg) raise ValueError(msg) if blacklist_bed is None: _LOGGER.info("Blacklist file not provided. Downloading...") try: blacklist_bed = get_blacklist_annotations(self.organism, self.genome) except AttributeError: msg = "Blacklist file was not provided and cannot" msg += " get one without analysis having `organism` and `genome` set." _LOGGER.error(msg) raise AttributeError(msg) comps = comparison_table["comparison_name"].drop_duplicates() f = tempfile.NamedTemporaryFile() with open(f.name, "a") as handle: for comparison in tqdm(comps, total=len(comps), desc="Comparison"): peak_files = [ os.path.join(peak_dir, comparison, comparison + "_peaks.narrowPeak"), os.path.join( peak_dir, comparison, comparison + "_homer_peaks.factor.bed" ), os.path.join( peak_dir, comparison, comparison + "_homer_peaks.histone.bed" ), ] for peak_file in peak_files: genome = ( comparison_table.loc[ comparison_table["comparison_name"] == comparison, "comparison_genome", ] .drop_duplicates() .squeeze() ) msg = "Could not determine genome of comparison '{}'.".format( comparison ) if not isinstance(genome, str): _LOGGER.error(msg) raise AssertionError(msg) # Get peak file try: f = re.sub("_peaks.narrowPeak", "_summits.bed", peak_file) file = ( pybedtools.BedTool(f) .slop(b=extension, genome=genome).fn if region_type == "summits" else peak_file) except (ValueError, FileNotFoundError): _LOGGER.warning( "Input file for comparison {} ({}) not found!" .format(comparison, f)) if not permissive: raise for line in open(file, 'r'): handle.write(line) # Merge overlaping peaks across comparisons sites = pybedtools.BedTool(f.name).sort().merge() # Filter # # remove blacklist regions if blacklist_bed is not False: if not isinstance(blacklist_bed, pybedtools.BedTool): blacklist = pybedtools.BedTool(blacklist_bed) sites = sites.intersect(v=True, b=blacklist) # # filter requested chromosomes if filter_chroms is not None: if isinstance(filter_chroms, list): sites = sites.filter(lambda x: x.chrom not in filter_chroms).saveas() elif isinstance(filter_chroms, str): s = sites.to_dataframe() sites = pybedtools.BedTool.from_dataframe(s.loc[~s['chrom'].str.match(filter_chroms)]) # Save and assign if save: output_file = os.path.join(self.results_dir, self.name + ".peak_set.bed") sites.saveas(output_file) sites = pybedtools.BedTool(output_file) if assign: self.sites = sites return sites
[docs] def calculate_peak_support( self, samples=None, region_type="summits", permissive=False, comparison_table=None, peak_dir="{results_dir}/chipseq_peaks"): """ Calculate a measure of support for each region in peak set (i.e. ratio of samples containing a peak overlapping region in union set of peaks). Parameters ---------- comparison_table : :obj:`pandas.DataFrame`, optional DataFrame with signal/background combinations used to call peaks Defaults to analysis' own `comparison_table`. peak_dir : :obj:`str`, optional Path to peaks output directory. Defaults to {analysis.results_dir}/chipseq_peaks samples: :obj:`list` Not used. Provided for compatibility with ATACSeqAnalysis class. region_type: :obj:`str` Not used. Provided for compatibility with ATACSeqAnalysis class. permissive: :obj:`bool` Not used. Provided for compatibility with ATACSeqAnalysis class. Attributes ---------- support : :obj:`pandas.DataFrame` DataFrame with signal/background combinations used to call peaks """ import pybedtools from tqdm import tqdm from ngs_toolkit.utils import bed_to_index if comparison_table is None: comparison_table = self.comparison_table peak_dir = os.path.abspath(self._format_string_with_attributes(peak_dir)) # get index index = bed_to_index(self.sites.to_dataframe()) # calculate support (number of samples overlaping each merged peak) comps = comparison_table["comparison_name"].drop_duplicates() support = pd.DataFrame(index=index) for comparison in tqdm(comps, total=comps.shape[0], desc="Comparison"): peak_files = [ ( "MACS", os.path.join( peak_dir, comparison, comparison + "_peaks.narrowPeak" ), ), ( "HOMER_factor", os.path.join( peak_dir, comparison, comparison + "_homer_peaks.factor.bed" ), ), ( "HOMER_histone", os.path.join( peak_dir, comparison, comparison + "_homer_peaks.histone.bed" ), ), ] for peak_type, peak_file in peak_files: try: sample_support = self.sites.intersect( peak_file, wa=True, c=True ).to_dataframe() except ( ValueError, pybedtools.MalformedBedLineError, pybedtools.helpers.BEDToolsError, ): _LOGGER.warning( "Peaks for comparison {} ({}) not found!".format( comparison, peak_file ) ) continue sample_support.index = index support[(comparison, peak_type)] = sample_support.iloc[:, 3] # Make multiindex labeling comparisons and peak type support.columns = pd.MultiIndex.from_tuples( support.columns, names=["comparison", "peak_type"] ) support.to_csv( os.path.join( self.results_dir, self.name + "_peaks.binary_overlap_support.csv" ), index=True, ) # divide sum (of unique overlaps) by total to get support value between 0 and 1 support["support"] = support.astype(bool).astype(int).sum(axis=1) / float( support.shape[1] ) # save support.to_csv( os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True ) self.support = support
[docs] def get_supported_peaks(self, samples=None, **kwargs): """ Get mask of sites with 0 support in the given samples. Requires support matrix produced by `ngs_toolkit.atacseq.ATACSeqAnalysis.calculate_peak_support`. Parameters ---------- samples : :obj:`list` Not used. Provided for compatibility with ATACSeqAnalysis class. comparisons : :obj:`list` Iterable of comparison names to restrict to. Must match name of comparisons used in comparison_table. Returns ------- pd.Series Boolean Pandas Series with sites with at least one of the given samples having a peak called. """ if "comparisons" not in kwargs: msg = "Requires a keyword argument `comparisons`." _LOGGER.error(msg) raise ValueError(msg) return self.support[[c for c in kwargs["comparisons"]]].sum(1) != 0
[docs] def normalize_by_background( self, comparison_table=None, reduction_func=np.mean, comparison_func=np.subtract, by_group=False, matrix="matrix_norm", samples=None): """ Normalize values in matrix by background samples in a comparison-specific way as specified in `comparison_table`. The background samples will be pooled by the `reduction_func` and their values wil be removed from the signal samples using the `comparison_func`. Parameters ---------- comparison_table : :class:`pandas.DataFrame` Table with comparisons from which peaks were called. Defaults to analysis' `comparison_table`. reduction_func : func Function to reduce the region to gene values to. Defaults to :obj:`numpy.mean`. comparison_func : func Function to use for normalization of signal samples against background samples. You can also try for example :obj:`numpy.divide`. Defaults to :obj:`numpy.subtract`. by_group : :obj:`bool` Whether output should be by group (:obj:`True`) or for each sample (:obj:`False`). Default is :obj:`False`. matrix : {:class:`pandas.DataFrame`, :obj:`str`, optional} Name of attribute or pandas DataFrame to use. Defaults to "matrix_norm". samples : :obj:`list`, optional Subset of samples to consider. Defaults to all samples in analysis. Returns ------- :class:`pandas.DataFrame` Dataframe with values normalized by background samples. """ if comparison_table is None: comparison_table = self.comparison_table if samples is None: samples = self.samples matrix = self.get_matrix(matrix, samples=samples) comparisons = set(comparison_table['comparison_name']) s_names = [s.name for s in samples] if not by_group: res = pd.DataFrame(index=matrix.index) for comparison in comparisons: comp = comparison_table.query("comparison_name == '{}'".format(comparison)) signal_samples = [s for s in comp.query("comparison_side == 1")['sample_name'] if s in s_names] background_samples = [s for s in comp.query("comparison_side == 0")['sample_name'] if s in s_names] for s in signal_samples: res.loc[:, s] = comparison_func( matrix.loc[:, s], matrix.loc[:, background_samples].apply(reduction_func, axis=1) if reduction_func != np.mean else matrix.loc[:, background_samples].mean(axis=1)) if by_group: comparisons = set(comparison_table['comparison_name']) s_names = [s.name for s in samples] res = pd.DataFrame(index=matrix.index, columns=comparisons) for comparison in comparisons: comp = comparison_table.query("comparison_name == '{}'".format(comparison)) signal_samples = [s for s in comp.query("comparison_side == 1")['sample_name'] if s in s_names] background_samples = [s for s in comp.query("comparison_side == 0")['sample_name'] if s in s_names] res.loc[:, comparison] = comparison_func( matrix.loc[:, signal_samples].apply(reduction_func, axis=1) if reduction_func != np.mean else matrix.loc[:, signal_samples].mean(axis=1), matrix.loc[:, background_samples].apply(reduction_func, axis=1) if reduction_func != np.mean else matrix.loc[:, background_samples].mean(axis=1)) return res