11ARwE3L

· 7 years ago · Dec 13, 2018, 01:18 PM
1def plot_differential_enrichment(
2        enrichment_table,
3        enrichment_type,
4        data_type="ATAC-seq",
5        direction_dependent=True,
6        output_dir="results/differential_analysis_{data_type}/enrichments",
7        comp_variable="comparison_name",
8        output_prefix="differential_analysis",
9        rasterized=True,
10        barplots=True,
11        correlation_plots=True,
12        clustermap_metric='correlation',
13        top_n=5,
14        z_score=0,
15        cmap=None):
16    """
17    Given a table of enrichment terms across several comparisons, produced
18    plots illustrating these enrichments in the various comparisons.
19
20    # TODO: add plotting of genomic region and chromatin state enrichment.
21    `comp_variable` is the column in the enrichment table that labels groups.
22
23    # TODO: split function in its smaller parts and call them appropriately.
24
25    :param enrichment_table: Data frame with enrichment results as produced by
26                             ``ngs_toolkit.general.differential_enrichment`` or
27                             ``ngs_toolkit.general.collect_differential_enrichment``.
28    :type enrichment_table: pandas.DataFrame
29    :param enrichment_type: One of 'lola', 'enrichr', 'motif', 'plot_differential_enrichment', great'.
30    :type enrichment_type: str
31    :param data_type: Data type. One of "ATAC-seq" and "RNA-seq". Defaults to "ATAC-seq".
32    :type data_type: str, optional
33    :param direction_dependent: Whether enrichments were made in a direction-dependent way
34                                (up-regulated and down-regulated features separately).
35                                This implies a column named "direction" exists". Defaults to True.
36    :type direction_dependent: bool, optional
37    :param output_dir: Directory to create output files. Defaults to "results/differential_analysis_{data_type}/enrichments".
38    :type output_dir: str, optional
39    :param comp_variable: Column defining which comparison enrichment terms belong to. Defaults to "comparison_name".
40    :type comp_variable: str, optional
41    :param output_prefix: Prefix to use when creating output files. Defaults to "differential_analysis".
42    :type output_prefix: str, optional
43    :param rasterized: Whether or not to rasterize heatmaps for efficient plotting. Defaults to True.
44    :type rasterized: bool, optional
45    :param barplots: Whether barplots with top enriched terms per comparison should be produced. Defaults to True.
46    :type barplots: bool, optional
47    :param correlation_plots: Whether correlation plots of comparisons across enriched terms should be produced. Defaults to True.
48    :type correlation_plots: bool, optional
49    :param clustermap_metric: Distance metric to use for clustermap clustering, Default to "correlation" (Pearson's).
50    :type clustermap_metric: str, optional
51    :param top_n: Top terms to be used to make barplots. Defaults to 5
52    :type top_n: number, optional
53    :param z_score: Which dimention/axis to perform Z-score transformation for. Numpy/Pandas conventions are used:
54                    `0` is row-wise (in this case across comparisons) and `1` is column-wise (across terms). Defaults to 0.
55    :type z_score: number, optional
56    :param cmap: Colormap to use in heatmaps. Default None.
57    :type cmap: str, optional
58    """
59    import numpy as np
60    import matplotlib
61    import matplotlib.pyplot as plt
62    import seaborn as sns
63    from scipy.stats import zscore
64
65    if enrichment_type not in ["homer_consensus"']:
66        raise AssertionError("`enrichment_type` must be 'homer_consensus'.")
67
68    if "{data_type}" in output_dir:
69        output_dir = output_dir.format(data_type=data_type)
70    if not os.path.exists(output_dir):
71        os.makedirs(output_dir)
72
73    if z_score == 0:
74        z_score_label = "Row"
75    elif z_score == 1:
76        z_score_label = "Column"
77    elif z_score is None:
78        pass
79    else:
80        raise ValueError("Argument 'z_score' must be on of 0, 1 or None.")
81
82    enrichment_table = enrichment_table.copy()
83    if "direction" in enrichment_table.columns and direction_dependent:
84        enrichment_table[comp_variable] = enrichment_table[comp_variable].astype(
85            str) + " " + enrichment_table["direction"].astype(str)
86
87    if enrichment_type == "homer_consensus":
88        enrichment_table.loc[:, "log_p_value"] = -(enrichment_table["Log P-value"].astype(np.float64))
89        # Plot top_n terms of each comparison in barplots
90        top_n = min(top_n, enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].count().min() - 1)
91        top_data = enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].nlargest(top_n).reset_index()
92
93        if barplots:
94            n = len(enrichment_table[comp_variable].drop_duplicates())
95            n_side = int(np.ceil(np.sqrt(n)))
96
97            fig, axis = plt.subplots(n_side, n_side, figsize=(4 * n_side, n_side * max(5, 0.12 * top_n)), sharex=False, sharey=False)
98            axis = iter(axis.flatten())
99            for i, comp in enumerate(top_data[comp_variable].drop_duplicates().sort_values()):
100                df2 = top_data.loc[top_data[comp_variable] == comp, :]
101                ax = next(axis)
102                sns.barplot(
103                    df2["log_p_value"], df2["Motif Name"], estimator=max,
104                    orient="horizontal", ax=ax, color=sns.color_palette("colorblind")[0])
105                ax.set_title(comp)
106            for ax in axis:
107                ax.set_visible(False)
108            sns.despine(fig)
109            fig.savefig(
110                os.path.join(output_dir, output_prefix + ".homer_consensus.barplot.top_{}.svg".format(top_n)),
111                bbox_inches="tight", dpi=300)
112
113        # Significance vs fold enrichment over background
114        enrichment_table.loc[:, 'enrichment_over_background'] = (
115            enrichment_table['% of Target Sequences with Motif'] /
116            enrichment_table['% of Background Sequences with Motif'])
117
118        n = len(enrichment_table[comp_variable].drop_duplicates())
119        n_side = int(np.ceil(np.sqrt(n)))
120        fig, axis = plt.subplots(n_side, n_side, figsize=(3 * n_side, 3 * n_side), sharex=False, sharey=False)
121        axis = axis.flatten()
122        for i, comp in enumerate(enrichment_table[comp_variable].drop_duplicates().sort_values()):
123            enr = enrichment_table[enrichment_table[comp_variable] == comp]
124            enr.loc[:, 'Motif Name'] = enr['Motif Name'].str.replace(".*BestGuess:", "").str.replace(r"-ChIP-Seq.*", "")
125
126            enr.loc[:, 'combined'] = enr[['enrichment_over_background', 'log_p_value']].apply(zscore).mean(axis=1)
127            cs = axis[i].scatter(
128                enr['enrichment_over_background'],
129                enr['log_p_value'],
130                c=enr['combined'],
131                s=8, alpha=0.75)
132
133            # label top points
134            for j in enr['combined'].sort_values().tail(5).index:
135                axis[i].text(
136                    enr.loc[j, 'enrichment_over_background'],
137                    enr.loc[j, 'log_p_value'],
138                    s=enr.loc[j, "Motif Name"], ha="right", fontsize=5)
139            axis[i].set_title(comp)
140
141        for ax in axis.reshape((n_side, n_side))[:, 0]:
142            ax.set_ylabel("-log10(p-value)")
143        for ax in axis.reshape((n_side, n_side))[-1, :]:
144            ax.set_xlabel("Enrichment over background")
145        sns.despine(fig)
146        fig.savefig(
147            os.path.join(output_dir, output_prefix + ".homer_consensus.scatterplot.svg"),
148            bbox_inches="tight", dpi=300)
149
150        # Plot heatmaps of terms for each comparison
151        if len(enrichment_table[comp_variable].drop_duplicates()) < 2:
152            return
153
154        for label, metric in [
155                ('-log10(p-value) of enrichment\nin', 'log_p_value'),
156                ('Enrichment over background of\n', 'enrichment_over_background')]:
157            # pivot table
158            motifs_pivot = pd.pivot_table(
159                enrichment_table, values=metric, columns="Motif Name", index=comp_variable).fillna(0)
160
161            # plot correlation
162            if correlation_plots:
163                g = sns.clustermap(motifs_pivot.T.corr(), cbar_kws={"label": "Correlation of enrichemnt\nof differential regions"})
164                g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
165                g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
166                g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.correlation.svg"), bbox_inches="tight", dpi=300)
167
168            top = enrichment_table.set_index('Motif Name').groupby(comp_variable)[metric].nlargest(top_n)
169            top_terms = top.index.get_level_values('Motif Name').unique()
170
171            # plot clustered heatmap
172            shape = motifs_pivot.loc[:, top_terms].shape
173            g = sns.clustermap(
174                motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
175                xticklabels=True, yticklabels=True,
176                cbar_kws={"label": label + " differential regions"}, metric="correlation")
177            g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
178            g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
179            g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.svg".format(metric)), bbox_inches="tight", dpi=300)
180
181            # plot clustered heatmap
182            shape = motifs_pivot.loc[:, top_terms].shape
183            if z_score is not None:
184                g = sns.clustermap(
185                    motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
186                    xticklabels=True, yticklabels=True,
187                    cmap="RdBu_r", center=0, z_score=z_score,
188                    cbar_kws={"label": "{} Z-score of {} differential regions".format(z_score_label, label)}, metric="correlation")
189                g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
190                g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
191                g.fig.savefig(
192                    os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.{}_z_score.svg".format(metric, z_score_label)),
193                    bbox_inches="tight", dpi=300)