· 7 years ago · Dec 13, 2018, 01:18 PM
1def plot_differential_enrichment(
2 enrichment_table,
3 enrichment_type,
4 data_type="ATAC-seq",
5 direction_dependent=True,
6 output_dir="results/differential_analysis_{data_type}/enrichments",
7 comp_variable="comparison_name",
8 output_prefix="differential_analysis",
9 rasterized=True,
10 barplots=True,
11 correlation_plots=True,
12 clustermap_metric='correlation',
13 top_n=5,
14 z_score=0,
15 cmap=None):
16 """
17 Given a table of enrichment terms across several comparisons, produced
18 plots illustrating these enrichments in the various comparisons.
19
20 # TODO: add plotting of genomic region and chromatin state enrichment.
21 `comp_variable` is the column in the enrichment table that labels groups.
22
23 # TODO: split function in its smaller parts and call them appropriately.
24
25 :param enrichment_table: Data frame with enrichment results as produced by
26 ``ngs_toolkit.general.differential_enrichment`` or
27 ``ngs_toolkit.general.collect_differential_enrichment``.
28 :type enrichment_table: pandas.DataFrame
29 :param enrichment_type: One of 'lola', 'enrichr', 'motif', 'plot_differential_enrichment', great'.
30 :type enrichment_type: str
31 :param data_type: Data type. One of "ATAC-seq" and "RNA-seq". Defaults to "ATAC-seq".
32 :type data_type: str, optional
33 :param direction_dependent: Whether enrichments were made in a direction-dependent way
34 (up-regulated and down-regulated features separately).
35 This implies a column named "direction" exists". Defaults to True.
36 :type direction_dependent: bool, optional
37 :param output_dir: Directory to create output files. Defaults to "results/differential_analysis_{data_type}/enrichments".
38 :type output_dir: str, optional
39 :param comp_variable: Column defining which comparison enrichment terms belong to. Defaults to "comparison_name".
40 :type comp_variable: str, optional
41 :param output_prefix: Prefix to use when creating output files. Defaults to "differential_analysis".
42 :type output_prefix: str, optional
43 :param rasterized: Whether or not to rasterize heatmaps for efficient plotting. Defaults to True.
44 :type rasterized: bool, optional
45 :param barplots: Whether barplots with top enriched terms per comparison should be produced. Defaults to True.
46 :type barplots: bool, optional
47 :param correlation_plots: Whether correlation plots of comparisons across enriched terms should be produced. Defaults to True.
48 :type correlation_plots: bool, optional
49 :param clustermap_metric: Distance metric to use for clustermap clustering, Default to "correlation" (Pearson's).
50 :type clustermap_metric: str, optional
51 :param top_n: Top terms to be used to make barplots. Defaults to 5
52 :type top_n: number, optional
53 :param z_score: Which dimention/axis to perform Z-score transformation for. Numpy/Pandas conventions are used:
54 `0` is row-wise (in this case across comparisons) and `1` is column-wise (across terms). Defaults to 0.
55 :type z_score: number, optional
56 :param cmap: Colormap to use in heatmaps. Default None.
57 :type cmap: str, optional
58 """
59 import numpy as np
60 import matplotlib
61 import matplotlib.pyplot as plt
62 import seaborn as sns
63 from scipy.stats import zscore
64
65 if enrichment_type not in ["homer_consensus"']:
66 raise AssertionError("`enrichment_type` must be 'homer_consensus'.")
67
68 if "{data_type}" in output_dir:
69 output_dir = output_dir.format(data_type=data_type)
70 if not os.path.exists(output_dir):
71 os.makedirs(output_dir)
72
73 if z_score == 0:
74 z_score_label = "Row"
75 elif z_score == 1:
76 z_score_label = "Column"
77 elif z_score is None:
78 pass
79 else:
80 raise ValueError("Argument 'z_score' must be on of 0, 1 or None.")
81
82 enrichment_table = enrichment_table.copy()
83 if "direction" in enrichment_table.columns and direction_dependent:
84 enrichment_table[comp_variable] = enrichment_table[comp_variable].astype(
85 str) + " " + enrichment_table["direction"].astype(str)
86
87 if enrichment_type == "homer_consensus":
88 enrichment_table.loc[:, "log_p_value"] = -(enrichment_table["Log P-value"].astype(np.float64))
89 # Plot top_n terms of each comparison in barplots
90 top_n = min(top_n, enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].count().min() - 1)
91 top_data = enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].nlargest(top_n).reset_index()
92
93 if barplots:
94 n = len(enrichment_table[comp_variable].drop_duplicates())
95 n_side = int(np.ceil(np.sqrt(n)))
96
97 fig, axis = plt.subplots(n_side, n_side, figsize=(4 * n_side, n_side * max(5, 0.12 * top_n)), sharex=False, sharey=False)
98 axis = iter(axis.flatten())
99 for i, comp in enumerate(top_data[comp_variable].drop_duplicates().sort_values()):
100 df2 = top_data.loc[top_data[comp_variable] == comp, :]
101 ax = next(axis)
102 sns.barplot(
103 df2["log_p_value"], df2["Motif Name"], estimator=max,
104 orient="horizontal", ax=ax, color=sns.color_palette("colorblind")[0])
105 ax.set_title(comp)
106 for ax in axis:
107 ax.set_visible(False)
108 sns.despine(fig)
109 fig.savefig(
110 os.path.join(output_dir, output_prefix + ".homer_consensus.barplot.top_{}.svg".format(top_n)),
111 bbox_inches="tight", dpi=300)
112
113 # Significance vs fold enrichment over background
114 enrichment_table.loc[:, 'enrichment_over_background'] = (
115 enrichment_table['% of Target Sequences with Motif'] /
116 enrichment_table['% of Background Sequences with Motif'])
117
118 n = len(enrichment_table[comp_variable].drop_duplicates())
119 n_side = int(np.ceil(np.sqrt(n)))
120 fig, axis = plt.subplots(n_side, n_side, figsize=(3 * n_side, 3 * n_side), sharex=False, sharey=False)
121 axis = axis.flatten()
122 for i, comp in enumerate(enrichment_table[comp_variable].drop_duplicates().sort_values()):
123 enr = enrichment_table[enrichment_table[comp_variable] == comp]
124 enr.loc[:, 'Motif Name'] = enr['Motif Name'].str.replace(".*BestGuess:", "").str.replace(r"-ChIP-Seq.*", "")
125
126 enr.loc[:, 'combined'] = enr[['enrichment_over_background', 'log_p_value']].apply(zscore).mean(axis=1)
127 cs = axis[i].scatter(
128 enr['enrichment_over_background'],
129 enr['log_p_value'],
130 c=enr['combined'],
131 s=8, alpha=0.75)
132
133 # label top points
134 for j in enr['combined'].sort_values().tail(5).index:
135 axis[i].text(
136 enr.loc[j, 'enrichment_over_background'],
137 enr.loc[j, 'log_p_value'],
138 s=enr.loc[j, "Motif Name"], ha="right", fontsize=5)
139 axis[i].set_title(comp)
140
141 for ax in axis.reshape((n_side, n_side))[:, 0]:
142 ax.set_ylabel("-log10(p-value)")
143 for ax in axis.reshape((n_side, n_side))[-1, :]:
144 ax.set_xlabel("Enrichment over background")
145 sns.despine(fig)
146 fig.savefig(
147 os.path.join(output_dir, output_prefix + ".homer_consensus.scatterplot.svg"),
148 bbox_inches="tight", dpi=300)
149
150 # Plot heatmaps of terms for each comparison
151 if len(enrichment_table[comp_variable].drop_duplicates()) < 2:
152 return
153
154 for label, metric in [
155 ('-log10(p-value) of enrichment\nin', 'log_p_value'),
156 ('Enrichment over background of\n', 'enrichment_over_background')]:
157 # pivot table
158 motifs_pivot = pd.pivot_table(
159 enrichment_table, values=metric, columns="Motif Name", index=comp_variable).fillna(0)
160
161 # plot correlation
162 if correlation_plots:
163 g = sns.clustermap(motifs_pivot.T.corr(), cbar_kws={"label": "Correlation of enrichemnt\nof differential regions"})
164 g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
165 g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
166 g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.correlation.svg"), bbox_inches="tight", dpi=300)
167
168 top = enrichment_table.set_index('Motif Name').groupby(comp_variable)[metric].nlargest(top_n)
169 top_terms = top.index.get_level_values('Motif Name').unique()
170
171 # plot clustered heatmap
172 shape = motifs_pivot.loc[:, top_terms].shape
173 g = sns.clustermap(
174 motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
175 xticklabels=True, yticklabels=True,
176 cbar_kws={"label": label + " differential regions"}, metric="correlation")
177 g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
178 g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
179 g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.svg".format(metric)), bbox_inches="tight", dpi=300)
180
181 # plot clustered heatmap
182 shape = motifs_pivot.loc[:, top_terms].shape
183 if z_score is not None:
184 g = sns.clustermap(
185 motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
186 xticklabels=True, yticklabels=True,
187 cmap="RdBu_r", center=0, z_score=z_score,
188 cbar_kws={"label": "{} Z-score of {} differential regions".format(z_score_label, label)}, metric="correlation")
189 g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
190 g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
191 g.fig.savefig(
192 os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.{}_z_score.svg".format(metric, z_score_label)),
193 bbox_inches="tight", dpi=300)