Source code for dim_reduction

# The purpose of this script is to
# 1) Transform the merged data (stringTie, featureCounts) for dimensionality reduction
# 2) Scale with MinMaxScaler
# 3) Plot with PCA, tSNE, UMAP

# imports
import os
import click
import numpy as np
import pandas as pd
from random import randint

from fileUtils.file_handling import read_tpm
import plotly.express as px
from time import time
from sklearn import preprocessing
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap  # = umap-learn
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import OrderedDict


@click.command()
@click.option('-i', '--inpath', required=False,
              help='Please enter path to file with merged TPM/featureCounts table without replica')
@click.option('--merged_replica', required=False,
              help='Please enter path to merged_gene_counts_table.txt with merged replica merged')
@click.option('-m', '--metadata', prompt='path to metadata', required=True,
              help='Path to file with metadata')
@click.option('-o', '--outpath', prompt='path to image folder', required=True)
@click.option('--pca/--no-pca', help='perform PCA analysis', default=False)
@click.option('--tsne/--no-tsne', help='perform t-SNE analysis', default=False)
@click.option('--umap/--no-umap', help='perform UMAP analysis', default=False)
@click.option('--comparison/--no-comparison', help='perform Comparison analysis', default=False)
@click.option('--silhouette/--no-silhouette', help='perform Silhouette analysis', default=False)
@click.option('-t', '--title', prompt='figure title', help='enter title for figure', required=True)
def main(inpath, merged_replica, metadata, outpath, pca, tsne, umap, comparison, silhouette, title):
    if inpath:
        print("Reading data...")
        sample_ids, gene_ids, feature_names, data = read_tpm(inpath)

        # convert data to numpy array for scaling and transpose to scale on genes
        # shape = (features, samples)
        new_data = np.array(data, dtype=float)

        # Transpose data for Dimensionality reduction
        # shape = (samples, features)
        print("Transposing data...")
        dataT = np.transpose(new_data)
        # scaling on the "columns" = per gene feature such that gene expr. values between (-1,1)
        print("Scaling data...")
        scaled_data = scaling(dataT)
        print("Reading metadata...")
        meta, target_names, target, annotations, project, pcolor_list, ccolor_list = read_metadata(metadata, sample_ids)

    if merged_replica:

        df = pd.read_csv(merged_replica, sep="\t")
        case_ids = df.iloc[:, 0].tolist()
        target_names = df.iloc[:, 1].tolist()
        target = df.iloc[:, 2].tolist()

        project_series = df.iloc[:, 3]
        project_set = set(project_series)
        project_dict = {v: 0 + k for k, v in enumerate(project_set)}
        project_arr = [project_dict[k] for k in project_series]

        unique_ids = df.iloc[:, 4].tolist()
        df = df.drop(['CaseID', 'SampleType', 'Condition', 'Project'], axis=1)

        cdict = {0: '#EF553B', 1: '#636EFA'}
        ccolor_list = [cdict[k] for k in target]

        plist = []
        for i in range(len(project_set)):
            plist.append('#{:06x}'.format(randint(0, 256 ** 3)))
        pdict = dict(zip(project_set, plist))
        pcolor_list = project_series.map(pdict)

        annotations = pd.DataFrame({'ID': unique_ids, 'Condition': target_names,
                                    'CaseID': case_ids, 'Project': project_series})
        annotations['ID'] = annotations['ID'].astype(str)

        df.set_index(['UniqueID'], inplace=True)
        scaled_data = scaling(df)

    print("Plotly comparison...")
    plotly_comparison(scaled_data, pcolor_list, ccolor_list, annotations, outpath, title)

    # Plotting
    if pca:
        visualization_plots(scaled_data, annotations, outpath, 'pca', title)
    if tsne:
        visualization_plots(scaled_data, annotations, outpath, 'tsne', title)
    if umap:
        visualization_plots(scaled_data, annotations, outpath, 'umap', title)

    if comparison:
        comparison_dim_reduction(scaled_data, pcolor_list, outpath + "project_")
        comparison_dim_reduction(scaled_data, ccolor_list, outpath + "condition_")

    if silhouette:
        silhouette_plot(scaled_data, target)
        print("Silhouette Coefficient: ", silhouette_score(scaled_data, target))
        print(silhouette_samples(scaled_data, target))


[docs]def read_metadata(metadata, sample_ids): """Read in metadata file csv format FileID, CaseID, SampleType, Project :return: meta_dict :return: target_names: dict :return: target: array [0,1,0,0,1,1,1,0...] :return: annotation: pd.DataFrame {ID, Condition, CaseID, Project} :return: project_arr: numerical encoding of projects [0,1,1,2,0,3,...] :return: color_list: color list for conditions :return: pcolor_list: project color list """ meta_dict = {} with open(metadata, 'r') as file: for line in file.readlines()[1:]: fields = line.strip().split(",") file_id = fields[0].strip() case = fields[1].strip() sample_type = fields[2].strip() project = fields[3].strip() if file_id not in meta_dict: meta_dict[file_id] = [case, sample_type, project] # dict mapping sample_ids to target conditions as strings target_names = {} for i, e in enumerate(sample_ids): if e not in target_names.keys(): target_names[e] = meta_dict[e][1] # encode conditions as target array of 0,1 target = [0 if value == 'normal' else 1 for key, value in target_names.items()] # Create Dataframe object for annotations for plots # sort the metadata dictionary according to order of sample_target_names sorted_meta = OrderedDict([(el, meta_dict[el]) for el in target_names]) tmp = {0 + i: [k, sorted_meta.get(k)] for i, k in enumerate(sorted_meta)} annos = pd.DataFrame.from_dict(tmp, orient='index') # expand list with metadata into own series tags = annos.iloc[:, 1].apply(pd.Series) tags.rename({0: 'CaseID', 1: 'Condition', 2: 'Project'}, axis=1, inplace=True) # concat dataframes annotations = pd.concat([annos[:], tags[:]], axis=1) annotations.drop([1], axis=1, inplace=True) annotations.rename({0: 'ID'}, axis=1, inplace=True) # get projects as different arrays, dicts project_series = annotations.iloc[:, 3] project_set = set(project_series) project_dict = {v: 0 + k for k, v in enumerate(project_set)} project_arr = [project_dict[k] for k in project_series] # define colors for projects pdict = {'GTEx-PRJNA75899': '#EF553B', 'TCGA-PAAD': '#AB63FA', 'PACA-AU': '#00CC96', 'PACA-CA': '#636EFA'} # pdict = {'SRP030040': '#85660D', 'SRP058626': '#782AB6', 'SRP187978': '#565656', 'SRP050003': '#1C8356', # 'SRP029880': '#16FF32', 'SRP137150': '#F7E1A0', 'SRP102722': '#E2E2E2', 'SRP026600': '#1CBE4F', # 'SRP064138': '#C4451C', 'SRP068551': '#DEA0FD', 'SRP062885': '#FE00FA', 'SRP069212': '#325A9B', # 'SRP048907': '#FEAF16', 'SRP050551': '#F8A19F', 'SRP076032': '#90AD1C', 'SRP068976': '#F6222E', # 'SRP070723': '#1CFFCE', 'SRP108560': '#2ED9FF', 'SRP056696': '#B10DA1', 'SRP040998': '#C075A6', # 'SRP118972': '#FC1CBF', 'SRP039694': '#B00068', 'SRP174502': '#FBE426', 'SRP049592': '#FA0087', # 'PRJNA75899': '#00CC96', 'TCGA-LIHC': '#EF553B', 'LIRI-JP': '#636EFA', 'TCGA-CHOL': '#AB63FA'} pcolor_list = project_series.map(pdict) # target condition colors, blue for normal tissue, red for tumor cdict = {0: '#636EFA', 1: '#EF553B'} ccolor_list = [cdict[k] for k in target] return meta_dict, target_names, target, annotations, project_arr, pcolor_list, ccolor_list
[docs]def scaling(data): """Scaling with MinMax to range (-1,1) :param data: numpy array :return: scaled_data """ scaler = preprocessing.MinMaxScaler() # scales in the range of -1 to 1 scaled_data = scaler.fit_transform(data, (-1, 1)) return scaled_data
[docs]def visualization_plots(data, target, outpath, method=str, title_dataset=str): """Creates interactive plotly graphs with reduced dimensions with tooltip displaying metadata :param data: numpy array :param target: list :param outpath: to folder for images .png, .html :param method: one of [pca, tsne, umap] :param title_dataset: i.e. Pancreas ComBat corrected """ # project color codes colors = ['#85660D', '#782AB6', '#565656', '#1C8356', '#16FF32', '#F7E1A0', '#E2E2E2', '#1CBE4F', '#C4451C', '#DEA0FD', '#FE00FA', '#325A9B', '#FEAF16', '#F8A19F', '#90AD1C', '#F6222E', '#1CFFCE', '#2ED9FF', '#B10DA1', '#C075A6', '#FC1CBF', '#B00068', '#FBE426', '#FA0087', '#EF553B', '#636EFA', '#00CC96', '#AB63FA'] colors2 = ['#EF553B', '#636EFA', '#00CC96', '#AB63FA'] # red, blue, green purple if method == 'pca': print("Performing pca for plotting...") pca = PCA(n_components=2) X_pca = pca.fit_transform(data) d1 = X_pca[:, 0] d2 = X_pca[:, 1] evr1 = pca.explained_variance_ratio_[0] evr2 = pca.explained_variance_ratio_[1] if method == 'tsne': print("Performing tsne for plotting...") # Dimension reduction # optional: init='pca', perplexity (neighbors), learning_rate tsne = TSNE(n_components=2, random_state=0) X_embedded = tsne.fit_transform(data) d1 = X_embedded[:, 0] d2 = X_embedded[:, 1] if method == 'umap': print("Performing umap for plotting...") # Dimension reduction # ensure reproducibility of embedding by setting random state, n_neighbors, metric='mahalanobis' # optional n_neighbors=30, min_dist=0 reducer = umap.UMAP(random_state=42) embedding = reducer.fit_transform(data) d1 = embedding[:, 0] d2 = embedding[:, 1] # Plotting by conditions C and projects P figC = px.scatter(x=d1, y=d2, color=target.Condition, hover_name=target.ID, hover_data={'project': target.Project, 'case_id': target.CaseID}, template="simple_white", color_discrete_sequence=['#636EFA', '#EF553B']) # px.colors.qualitative.Plotly figP = px.scatter(x=d1, y=d2, color=target.Project, hover_name=target.ID, hover_data={'condition': target.Condition, 'case_id': target.CaseID}, template="simple_white", color_discrete_sequence=colors2) if method == 'pca': figC.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset', xaxis_title="PC1 - explained variance: " + str(round(evr1 * 100, 2)), yaxis_title="PC2 - explained variance: " + str(round(evr2 * 100, 2)), legend_title="Condition") # , colorway= ['#636EFA', '#EF553B']) # blue, red figP.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset', xaxis_title="PC1 - explained variance: " + str(round(evr1 * 100, 2)), yaxis_title="PC2 - explained variance: " + str(round(evr2 * 100, 2)), legend_title="Project", colorway=colors2) else: figC.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset', xaxis_title="Dim1", yaxis_title="Dim2", legend_title="Condition") # colorway=px.colors.qualitative.Plotly# ['#636EFA', '#EF553B']) # blue, red figP.update_layout(title=str(method).upper() + ' - ' + title_dataset + ' dataset', xaxis_title="Dim1", yaxis_title="Dim2", legend_title="Project", colorway=colors2) # control location of legend with yanchor="top", y=0.99, xanchor="left", x=0.01, # orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, figC.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, font=dict(size=20))) figP.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, font=dict(size=20))) figC.update_traces(marker={'size': 20}) figC.show() figP.update_traces(marker={'size': 20}) figP.show() # export to html if not os.path.exists(outpath): os.mkdir(outpath) figC.write_html(outpath + str(method) + '_' + title_dataset.lower() + "_condition.html") figP.write_html(outpath + str(method) + '_' + title_dataset.lower() + "_project.html") # reduce marker sizes for pdf pictures figC.update_traces(marker={'size': 10}) figP.update_traces(marker={'size': 10}) figC.update_xaxes(linewidth=3, title_font_size=20, tickfont_size=20) figC.update_yaxes(linewidth=3, title_font_size=20, tickfont_size=20) figP.update_xaxes(linewidth=3, title_font_size=20, tickfont_size=20) figC.update_yaxes(linewidth=3, title_font_size=20, tickfont_size=20) figC.write_image(outpath + str(method) + '_' + title_dataset.lower() + "_condition.pdf", scale=3) figP.write_image(outpath + str(method) + '_' + title_dataset.lower() + "_project.pdf", scale=3)
[docs]def comparison_dim_reduction(X, target, outpath): """Main code from scikit-learn to create comparative plots with different reduction methods :param X: data :param target: list :param outpath: to store image .png """ # Create figure fig = plt.figure(figsize=(15, 5)) # uncomment to plot title # fig.suptitle("Dimensionality reduction with ", fontsize=14) n_components = 2 # Set-up manifold methods methods = OrderedDict() methods['PCA'] = PCA(n_components=n_components, random_state=0) # init='pca' methods['t-SNE'] = TSNE(n_components=n_components, random_state=0) methods['UMAP'] = umap.UMAP(random_state=42) # Plot results for i, (label, method) in enumerate(methods.items()): t0 = time() Y = method.fit_transform(X) t1 = time() print("%s: %.2g sec" % (label, t1 - t0)) ax = fig.add_subplot(1, 3, 1 + i) ax.scatter(Y[:, 0], Y[:, 1], c=target) # , cmap=plt.cm.Spectral), cmap=plt.cm.Set2 # ax.set_title("%s (%.2g sec)" % (label, t1 - t0)) ax.set_title("%s" % label, fontsize=16, fontweight='bold') ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) if label == 'PCA': evr1 = method.explained_variance_ratio_[0] evr2 = method.explained_variance_ratio_[1] ax.set_xlabel("PC1 - " + str(round(evr1 * 100, 2)) + "%", fontsize=20) ax.set_ylabel("PC2 - " + str(round(evr2 * 100, 2)) + "%", fontsize=20) else: ax.set_xlabel('Dim1', fontsize=20) ax.set_ylabel('Dim2', fontsize=20) ax.tick_params(axis='both', which='major', labelsize=20) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) ax.axis('tight') if not os.path.exists(outpath): os.mkdir(outpath) plt.savefig(outpath + "comparison_plot.pdf", format='pdf')
[docs]def plotly_comparison(data, pcolor_list, ccolor_list, annos, outpath, title_dataset): """Plot all dimensionality reduction techniques into one plotly plot as HTML and PDF :param data: numpy array :param pcolor_list: list of project colors :param ccolor_list: list of condition colors :param annos: annotation dataframe: FileID, CaseID, SampleType, Project :param outpath: for images :param title_dataset: for output path :return: None """ pca = PCA(n_components=2) X_pca = pca.fit_transform(data) d1 = X_pca[:, 0] d2 = X_pca[:, 1] evr1 = pca.explained_variance_ratio_[0] evr2 = pca.explained_variance_ratio_[1] tsne = TSNE(n_components=2, random_state=0, perplexity=20) # , perplexity=20 X_embedded = tsne.fit_transform(data) t1 = X_embedded[:, 0] t2 = X_embedded[:, 1] reducer = umap.UMAP(random_state=42, n_neighbors=30, min_dist=0.0) # , n_neighbors=30, min_dist=0.0 embedding = reducer.fit_transform(data) u1 = embedding[:, 0] u2 = embedding[:, 1] fig = make_subplots(rows=2, cols=3, subplot_titles=("PCA", "t-SNE", "UMAP")) fig.add_trace(go.Scatter(x=d1, y=d2, mode='markers', marker=dict(color=pcolor_list)), row=1, col=1) fig.add_trace(go.Scatter(x=t1, y=t2, mode='markers', marker=dict(color=pcolor_list)), row=1, col=2) fig.add_trace(go.Scatter(x=u1, y=u2, mode='markers', marker=dict(color=pcolor_list)), row=1, col=3) fig.update_traces(hovertext='ID: ' + annos.ID + '<br>' + 'Project: ' + annos.Project + '<br>' + 'Condition: ' + annos.Condition + '<br>' + 'CaseID: ' + annos.CaseID) # fig.update_traces(marker_color=target) fig.add_trace(go.Scatter(x=d1, y=d2, mode='markers', marker=dict(color=ccolor_list)), row=2, col=1) fig.add_trace(go.Scatter(x=t1, y=t2, mode='markers', marker=dict(color=ccolor_list)), row=2, col=2) fig.add_trace(go.Scatter(x=u1, y=u2, mode='markers', marker=dict(color=ccolor_list)), row=2, col=3) fig.update_traces(hovertext='ID: ' + annos.ID + '<br>' + 'Project: ' + annos.Project + '<br>' + 'Condition: ' + annos.Condition + '<br>' + 'CaseID: ' + annos.CaseID) # fig.update_traces(marker_color=conditions) fig.update_layout(template="simple_white", width=2000, height=1200, showlegend=False, title_text='Dimensionality Reduction Comparison') fig.show() if not os.path.exists(outpath): os.mkdir(outpath) fig.write_html(outpath + "comparison_" + title_dataset.lower() + ".html") fig.write_image( outpath + "comparison_" + title_dataset.lower() + ".pdf") # plotly-orca not working --> now using kaleido
[docs]def silhouette_plot(data, target): """Determine good number of clusters :param data: numpy array :param target: list """ fig, ax = plt.subplots() ax.set_title("The silhouette plot") ax.set_xlabel("The silhouette coefficient values") ax.set_ylabel("Cluster label") # The subplot is the silhouette plot ax.set_xlim([-0.1, 1]) lower = 10 clusters = ['normal', 'tumor'] target = np.array(target) # UMAP reducer = umap.UMAP(random_state=10) embedding = reducer.fit_transform(data) for i, condition in enumerate(clusters): # Compute the silhouette score average silhouette_avg = silhouette_score(embedding, target) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(embedding, target) # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = sample_silhouette_values[target == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] upper = lower + size_cluster_i color = cm.nipy_spectral(float(i) / 2) # plotting the silhouette clusters ax.fill_betweenx(np.arange(lower, upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) # Label the silhouette plots with their cluster numbers at the middle ax.text(-0.05, lower + 0.5 * size_cluster_i, condition) # shift by 10 to plot space between silhouettes lower = upper + 10 # 10 for the 0 samples # The vertical line for average silhouette score of all the values ax.axvline(x=silhouette_avg, color="red", linestyle="--") ax.set_yticks([]) # Clear the yaxis labels / ticks ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) fig.show()
if __name__ == "__main__": main()