# The purpose of this script is to
# 1) Transform the merged data (stringTie, featureCounts) for dimensionality reduction
# 2) Scale with MinMaxScaler
# 3) Plot with PCA, tSNE, UMAP
# imports
import os
import click
import numpy as np
import pandas as pd
from random import randint
from fileUtils.file_handling import read_tpm
import plotly.express as px
from time import time
from sklearn import preprocessing
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap # = umap-learn
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import OrderedDict
@click.command()
@click.option('-i', '--inpath', required=False,
help='Please enter path to file with merged TPM/featureCounts table without replica')
@click.option('--merged_replica', required=False,
help='Please enter path to merged_gene_counts_table.txt with merged replica merged')
@click.option('-m', '--metadata', prompt='path to metadata', required=True,
help='Path to file with metadata')
@click.option('-o', '--outpath', prompt='path to image folder', required=True)
@click.option('--pca/--no-pca', help='perform PCA analysis', default=False)
@click.option('--tsne/--no-tsne', help='perform t-SNE analysis', default=False)
@click.option('--umap/--no-umap', help='perform UMAP analysis', default=False)
@click.option('--comparison/--no-comparison', help='perform Comparison analysis', default=False)
@click.option('--silhouette/--no-silhouette', help='perform Silhouette analysis', default=False)
@click.option('-t', '--title', prompt='figure title', help='enter title for figure', required=True)
def main(inpath, merged_replica, metadata, outpath, pca, tsne, umap, comparison, silhouette, title):
if inpath:
print("Reading data...")
sample_ids, gene_ids, feature_names, data = read_tpm(inpath)
# convert data to numpy array for scaling and transpose to scale on genes
# shape = (features, samples)
new_data = np.array(data, dtype=float)
# Transpose data for Dimensionality reduction
# shape = (samples, features)
print("Transposing data...")
dataT = np.transpose(new_data)
# scaling on the "columns" = per gene feature such that gene expr. values between (-1,1)
print("Scaling data...")
scaled_data = scaling(dataT)
print("Reading metadata...")
meta, target_names, target, annotations, project, pcolor_list, ccolor_list = read_metadata(metadata, sample_ids)
if merged_replica:
df = pd.read_csv(merged_replica, sep="\t")
case_ids = df.iloc[:, 0].tolist()
target_names = df.iloc[:, 1].tolist()
target = df.iloc[:, 2].tolist()
project_series = df.iloc[:, 3]
project_set = set(project_series)
project_dict = {v: 0 + k for k, v in enumerate(project_set)}
project_arr = [project_dict[k] for k in project_series]
unique_ids = df.iloc[:, 4].tolist()
df = df.drop(['CaseID', 'SampleType', 'Condition', 'Project'], axis=1)
cdict = {0: '#EF553B', 1: '#636EFA'}
ccolor_list = [cdict[k] for k in target]
plist = []
for i in range(len(project_set)):
plist.append('#{:06x}'.format(randint(0, 256 ** 3)))
pdict = dict(zip(project_set, plist))
pcolor_list = project_series.map(pdict)
annotations = pd.DataFrame({'ID': unique_ids, 'Condition': target_names,
'CaseID': case_ids, 'Project': project_series})
annotations['ID'] = annotations['ID'].astype(str)
df.set_index(['UniqueID'], inplace=True)
scaled_data = scaling(df)
print("Plotly comparison...")
plotly_comparison(scaled_data, pcolor_list, ccolor_list, annotations, outpath, title)
# Plotting
if pca:
visualization_plots(scaled_data, annotations, outpath, 'pca', title)
if tsne:
visualization_plots(scaled_data, annotations, outpath, 'tsne', title)
if umap:
visualization_plots(scaled_data, annotations, outpath, 'umap', title)
if comparison:
comparison_dim_reduction(scaled_data, pcolor_list, outpath + "project_")
comparison_dim_reduction(scaled_data, ccolor_list, outpath + "condition_")
if silhouette:
silhouette_plot(scaled_data, target)
print("Silhouette Coefficient: ", silhouette_score(scaled_data, target))
print(silhouette_samples(scaled_data, target))
[docs]def scaling(data):
"""Scaling with MinMax to range (-1,1)
:param data: numpy array
:return: scaled_data
"""
scaler = preprocessing.MinMaxScaler()
# scales in the range of -1 to 1
scaled_data = scaler.fit_transform(data, (-1, 1))
return scaled_data
[docs]def visualization_plots(data, target, outpath, method=str, title_dataset=str):
"""Creates interactive plotly graphs with reduced dimensions with tooltip displaying metadata
:param data: numpy array
:param target: list
:param outpath: to folder for images .png, .html
:param method: one of [pca, tsne, umap]
:param title_dataset: i.e. Pancreas ComBat corrected
"""
# project color codes
colors = ['#85660D', '#782AB6', '#565656', '#1C8356', '#16FF32', '#F7E1A0', '#E2E2E2', '#1CBE4F', '#C4451C',
'#DEA0FD', '#FE00FA', '#325A9B', '#FEAF16', '#F8A19F', '#90AD1C', '#F6222E', '#1CFFCE', '#2ED9FF',
'#B10DA1', '#C075A6', '#FC1CBF', '#B00068', '#FBE426', '#FA0087', '#EF553B', '#636EFA', '#00CC96',
'#AB63FA']
colors2 = ['#EF553B', '#636EFA', '#00CC96', '#AB63FA'] # red, blue, green purple
if method == 'pca':
print("Performing pca for plotting...")
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data)
d1 = X_pca[:, 0]
d2 = X_pca[:, 1]
evr1 = pca.explained_variance_ratio_[0]
evr2 = pca.explained_variance_ratio_[1]
if method == 'tsne':
print("Performing tsne for plotting...")
# Dimension reduction
# optional: init='pca', perplexity (neighbors), learning_rate
tsne = TSNE(n_components=2, random_state=0)
X_embedded = tsne.fit_transform(data)
d1 = X_embedded[:, 0]
d2 = X_embedded[:, 1]
if method == 'umap':
print("Performing umap for plotting...")
# Dimension reduction
# ensure reproducibility of embedding by setting random state, n_neighbors, metric='mahalanobis'
# optional n_neighbors=30, min_dist=0
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(data)
d1 = embedding[:, 0]
d2 = embedding[:, 1]
# Plotting by conditions C and projects P
figC = px.scatter(x=d1, y=d2, color=target.Condition, hover_name=target.ID,
hover_data={'project': target.Project, 'case_id': target.CaseID},
template="simple_white", color_discrete_sequence=['#636EFA', '#EF553B']) # px.colors.qualitative.Plotly
figP = px.scatter(x=d1, y=d2, color=target.Project, hover_name=target.ID,
hover_data={'condition': target.Condition, 'case_id': target.CaseID},
template="simple_white", color_discrete_sequence=colors2)
if method == 'pca':
figC.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset',
xaxis_title="PC1 - explained variance: " + str(round(evr1 * 100, 2)),
yaxis_title="PC2 - explained variance: " + str(round(evr2 * 100, 2)),
legend_title="Condition") # , colorway= ['#636EFA', '#EF553B']) # blue, red
figP.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset',
xaxis_title="PC1 - explained variance: " + str(round(evr1 * 100, 2)),
yaxis_title="PC2 - explained variance: " + str(round(evr2 * 100, 2)),
legend_title="Project", colorway=colors2)
else:
figC.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset',
xaxis_title="Dim1", yaxis_title="Dim2",
legend_title="Condition") # colorway=px.colors.qualitative.Plotly# ['#636EFA', '#EF553B']) # blue, red
figP.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset',
xaxis_title="Dim1", yaxis_title="Dim2", legend_title="Project",
colorway=colors2)
# control location of legend with yanchor="top", y=0.99, xanchor="left", x=0.01,
# orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
figC.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, font=dict(size=20)))
figP.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, font=dict(size=20)))
figC.update_traces(marker={'size': 20})
figC.show()
figP.update_traces(marker={'size': 20})
figP.show()
# export to html
if not os.path.exists(outpath):
os.mkdir(outpath)
figC.write_html(outpath + str(method) + '_' + title_dataset.lower() + "_condition.html")
figP.write_html(outpath + str(method) + '_' + title_dataset.lower() + "_project.html")
# reduce marker sizes for pdf pictures
figC.update_traces(marker={'size': 10})
figP.update_traces(marker={'size': 10})
figC.update_xaxes(linewidth=3, title_font_size=20, tickfont_size=20)
figC.update_yaxes(linewidth=3, title_font_size=20, tickfont_size=20)
figP.update_xaxes(linewidth=3, title_font_size=20, tickfont_size=20)
figC.update_yaxes(linewidth=3, title_font_size=20, tickfont_size=20)
figC.write_image(outpath + str(method) + '_' + title_dataset.lower() + "_condition.pdf", scale=3)
figP.write_image(outpath + str(method) + '_' + title_dataset.lower() + "_project.pdf", scale=3)
[docs]def comparison_dim_reduction(X, target, outpath):
"""Main code from scikit-learn to create comparative plots with different reduction methods
:param X: data
:param target: list
:param outpath: to store image .png
"""
# Create figure
fig = plt.figure(figsize=(15, 5))
# uncomment to plot title
# fig.suptitle("Dimensionality reduction with ", fontsize=14)
n_components = 2
# Set-up manifold methods
methods = OrderedDict()
methods['PCA'] = PCA(n_components=n_components, random_state=0)
# init='pca'
methods['t-SNE'] = TSNE(n_components=n_components, random_state=0)
methods['UMAP'] = umap.UMAP(random_state=42)
# Plot results
for i, (label, method) in enumerate(methods.items()):
t0 = time()
Y = method.fit_transform(X)
t1 = time()
print("%s: %.2g sec" % (label, t1 - t0))
ax = fig.add_subplot(1, 3, 1 + i)
ax.scatter(Y[:, 0], Y[:, 1], c=target) # , cmap=plt.cm.Spectral), cmap=plt.cm.Set2
# ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
ax.set_title("%s" % label, fontsize=16, fontweight='bold')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
if label == 'PCA':
evr1 = method.explained_variance_ratio_[0]
evr2 = method.explained_variance_ratio_[1]
ax.set_xlabel("PC1 - " + str(round(evr1 * 100, 2)) + "%", fontsize=20)
ax.set_ylabel("PC2 - " + str(round(evr2 * 100, 2)) + "%", fontsize=20)
else:
ax.set_xlabel('Dim1', fontsize=20)
ax.set_ylabel('Dim2', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
ax.axis('tight')
if not os.path.exists(outpath):
os.mkdir(outpath)
plt.savefig(outpath + "comparison_plot.pdf", format='pdf')
[docs]def plotly_comparison(data, pcolor_list, ccolor_list, annos, outpath, title_dataset):
"""Plot all dimensionality reduction techniques into one plotly plot as HTML and PDF
:param data: numpy array
:param pcolor_list: list of project colors
:param ccolor_list: list of condition colors
:param annos: annotation dataframe: FileID, CaseID, SampleType, Project
:param outpath: for images
:param title_dataset: for output path
:return: None
"""
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data)
d1 = X_pca[:, 0]
d2 = X_pca[:, 1]
evr1 = pca.explained_variance_ratio_[0]
evr2 = pca.explained_variance_ratio_[1]
tsne = TSNE(n_components=2, random_state=0, perplexity=20) # , perplexity=20
X_embedded = tsne.fit_transform(data)
t1 = X_embedded[:, 0]
t2 = X_embedded[:, 1]
reducer = umap.UMAP(random_state=42, n_neighbors=30, min_dist=0.0) # , n_neighbors=30, min_dist=0.0
embedding = reducer.fit_transform(data)
u1 = embedding[:, 0]
u2 = embedding[:, 1]
fig = make_subplots(rows=2, cols=3, subplot_titles=("PCA", "t-SNE", "UMAP"))
fig.add_trace(go.Scatter(x=d1, y=d2, mode='markers', marker=dict(color=pcolor_list)), row=1, col=1)
fig.add_trace(go.Scatter(x=t1, y=t2, mode='markers', marker=dict(color=pcolor_list)), row=1, col=2)
fig.add_trace(go.Scatter(x=u1, y=u2, mode='markers', marker=dict(color=pcolor_list)), row=1, col=3)
fig.update_traces(hovertext='ID: ' + annos.ID + '<br>' + 'Project: ' + annos.Project + '<br>' + 'Condition: ' +
annos.Condition + '<br>' + 'CaseID: ' + annos.CaseID)
# fig.update_traces(marker_color=target)
fig.add_trace(go.Scatter(x=d1, y=d2, mode='markers', marker=dict(color=ccolor_list)), row=2, col=1)
fig.add_trace(go.Scatter(x=t1, y=t2, mode='markers', marker=dict(color=ccolor_list)), row=2, col=2)
fig.add_trace(go.Scatter(x=u1, y=u2, mode='markers', marker=dict(color=ccolor_list)), row=2, col=3)
fig.update_traces(hovertext='ID: ' + annos.ID + '<br>' + 'Project: ' + annos.Project + '<br>' + 'Condition: ' +
annos.Condition + '<br>' + 'CaseID: ' + annos.CaseID)
# fig.update_traces(marker_color=conditions)
fig.update_layout(template="simple_white", width=2000, height=1200, showlegend=False,
title_text='Dimensionality Reduction Comparison')
fig.show()
if not os.path.exists(outpath):
os.mkdir(outpath)
fig.write_html(outpath + "comparison_" + title_dataset.lower() + ".html")
fig.write_image(
outpath + "comparison_" + title_dataset.lower() + ".pdf") # plotly-orca not working --> now using kaleido
[docs]def silhouette_plot(data, target):
"""Determine good number of clusters
:param data: numpy array
:param target: list
"""
fig, ax = plt.subplots()
ax.set_title("The silhouette plot")
ax.set_xlabel("The silhouette coefficient values")
ax.set_ylabel("Cluster label")
# The subplot is the silhouette plot
ax.set_xlim([-0.1, 1])
lower = 10
clusters = ['normal', 'tumor']
target = np.array(target)
# UMAP
reducer = umap.UMAP(random_state=10)
embedding = reducer.fit_transform(data)
for i, condition in enumerate(clusters):
# Compute the silhouette score average
silhouette_avg = silhouette_score(embedding, target)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(embedding, target)
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[target == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
upper = lower + size_cluster_i
color = cm.nipy_spectral(float(i) / 2)
# plotting the silhouette clusters
ax.fill_betweenx(np.arange(lower, upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax.text(-0.05, lower + 0.5 * size_cluster_i, condition)
# shift by 10 to plot space between silhouettes
lower = upper + 10 # 10 for the 0 samples
# The vertical line for average silhouette score of all the values
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
ax.set_yticks([]) # Clear the yaxis labels / ticks
ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
fig.show()
if __name__ == "__main__":
main()