Source code for ngs_toolkit.parsers

#!/usr/bin/env python

import os

import pandas as pd


[docs]def parse_ame(ame_output): """ Parse results of MEME-AME motif enrichment. Parameters ---------- ame_output : :obj:`str` MEME-AME results file. Returns ---------- pandas.DataFrame Data frame with enrichment statistics for each found TF motif. Raises ------- IOError If directory contain """ with open(ame_output, "r") as handle: lines = handle.readlines() output = list() for line in lines: # skip header lines if line[0] not in [str(i) for i in range(10)]: continue # get motif string and the first half of it (simple name) motif = line.strip().split(" ")[5].split("_")[0] # get corrected p-value q_value = float(line.strip().split(" ")[-2]) # append output.append((motif, q_value)) r = pd.Series(dict(output)).reset_index() r.columns = ["TF", "p_value"] return r
[docs]def parse_homer(homer_dir): """ Parse results of HOMER findMotifs.pl de novo motif enrichment. Parameters ---------- homer_dir : :obj:`str` Directory with HOMER results. Returns ---------- pandas.DataFrame Data frame with enrichment statistics for each found TF motif. Raises ------- IOError """ import glob import re motif_htmls = sorted(glob.glob(os.path.join(homer_dir, "motif*.info.html"))) if len(motif_htmls) < 1: raise IOError("Homer directory does not contain any discovered motifs.") output = pd.DataFrame() for motif_html in motif_htmls: motif = int( re.sub( ".info.html", "", re.sub(os.path.join(homer_dir, "motif"), "", motif_html), ) ) with open(motif_html, "r") as handle: content = handle.read() # Parse table with motif info info_table = content[ re.search("""<TABLE border="1" cellpading="0" cellspacing="0">""", content) .end(): re.search("</TABLE>", content) .start() ].strip() info_table = pd.DataFrame( [ x.split("</TD><TD>") for x in info_table.replace("<TR><TD>", "").split("</TD></TR>") ] ) info_table.columns = ["description", "value"] info_table["description"] = info_table["description"].str.strip() info_table["motif"] = motif # Add most probable known motif name info_table["known_motif"] = content[ re.search("<H4>", content).end(): re.search("</H4>", content).start() ] # append output = output.append(info_table, ignore_index=True) return output.sort_values("motif")
[docs]def parse_great_enrichment(input_tsv): """ Parse output from GREAT enrichment (http://great.stanford.edu). Parameters ---------- input_tsv : :obj:`str` TSV file exported from GREAT through the option "All data as .tsv" in "Global Controls". Returns ---------- pandas.DataFrame Pandas dataframe with enrichment results. """ df = pd.read_csv(input_tsv, sep="\t", skiprows=3) df.columns = df.columns.str.replace("# ", "") return df.loc[~df.iloc[:, 0].str.startswith("#")]