""" This script can train different machine learning models performs grid search and evaluation of models
"""
# Bash wrapper script to run machine_learning_tool.py in ml_docker_container
#
# declare -a classifiers=("LinearSVC" "SVC" "RandomForest" "MultiLayerPerceptron" )
#
# for val in ${classifiers[@]}; do \
# echo Starting to train $val classifier \
# python app/machine_learning_tool.py \
# -i <INPATH.TXT> \
# -m <METADATA.CSV> \
# -a $val \
# -o <DEST-DIR> \
# -t <TITLE> \
# done
# imports logging
import click
import logging
from time import time
import os
# imports data handling
import pandas as pd
import numpy as np
import fileUtils.metadata_handling as mh
# imports scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
# scores
from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import classification_report, make_scorer, matthews_corrcoef
# classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
# feature selection
from sklearn.feature_selection import VarianceThreshold
# plotting
import umap
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
# Create logger
logger = logging.getLogger('Machine Learning Tool')
# Create console handler
ch = logging.StreamHandler()
# Create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
logger.setLevel(logging.INFO)
@click.command()
@click.option('-i', '--inpath', prompt='path to gene_count table',
help='path to gene_count table', required=True)
@click.option('-m', '--metadata', prompt='path to metadata table',
help='Table with metadata in csv format', required=True)
@click.option('-a', '--algorithm', prompt='please enter a machine learning algorithm',
help='please enter a machine learning algorithm', required=True,
type=click.Choice(['LinearSVC', 'SVC', 'RandomForest', 'MultiLayerPerceptron'], case_sensitive=True))
@click.option('-k', '--kegg', help='provide path to file with KEGG genes of interest',
required=False, default=None)
@click.option('-o', '--outpath', prompt='path for outputs',
help='Path for output tables and plots. Extension is filled automatically.')
@click.option('-c', '--cores', help='please enter number of cpu cores to be used for search', default=2)
@click.option('-t', '--title', help='please enter a dataset title/tissue title')
def main(inpath, metadata, algorithm, kegg, outpath, title, cores):
start_time = time()
# Make sure output directory exists
new_outpath = outpath + 'ml_results/'
if not os.path.exists(new_outpath):
os.mkdir(new_outpath)
logger.info('Parse gene_expression table...')
if kegg:
logger.info('Applying KEGG filter...')
kegg_genes_pathways = pd.read_csv(kegg, sep="\t")
kegg_ensembl = kegg_genes_pathways.iloc[:, 0].tolist()
kegg_name = kegg_genes_pathways.iloc[:, 1].tolist()
id2name_dict = dict(zip(kegg_ensembl, kegg_name))
expr_df, sample_names, gene_ids, feature_names = read_dataset(inpath, ensembl_filter=kegg_ensembl,
id2name=id2name_dict)
# No filtering for KEGG genes
else:
expr_df, sample_names, gene_ids, feature_names = read_dataset(inpath, ensembl_filter=None,
id2name=None)
logger.info('Parse metadata...')
meta_dict, target_names, target, annotations = mh.read_metadata(metadata, sample_names)
logger.info('Split training and testing data...')
X_train, X_test, y_train, y_test = train_test_split(expr_df, target, stratify=target, test_size=0.2,
random_state=99)
logger.info('Feature scaling and filtering out 0-variance genes...')
X_train_f, X_test_f, f_names_red, gene_ids_red = feature_reduction(X_train, X_test, feature_names)
# Define models
# pass randomState Instance to RandomForest upon instantiation (following best practices sklearn)
models = {'LinearSVC': LinearSVC(random_state=37), 'SVC': SVC(random_state=42),
'RandomForest': RandomForestClassifier(random_state=np.random.RandomState(0)),
'MultiLayerPerceptron': MLPClassifier(random_state=87)}
logger.info('Performing grid search...')
grid_search(model=algorithm, model_dict=models, X_train=X_train_f, y_train=y_train, X_test=X_test_f, y_test=y_test,
gene_ids=gene_ids_red, feature_names=f_names_red, splits=5, n_jobs=cores, refitting=True,
outpath=new_outpath, dataset_title=title)
end_time = time()
logger.info('Process finished in ' + str(round(end_time - start_time, 2)) + "sec")
[docs]def read_dataset(path, ensembl_filter, id2name):
""" Function to parse a dataset
:param path: to input gene-expression table (GeneID, GeneName, Sample1, Sample2, ...)
:param ensembl_filter: input list of ensembl_gene_ids
:param names: input list of gene_names
:return: pandas.Dataframe
:return: sample_names, list
:return: gene_ids = features, list
:return: gene_names = feature_names, list
"""
data = pd.read_csv(path, sep="\t")
df = data.T
feature_names = df.iloc[1].tolist()
df = df.drop(["GeneName"], axis=0)
df.columns = df.iloc[0]
df = df.drop(["GeneID"], axis=0)
df.columns.name = None
sample_names = df.index.tolist()
gene_ids = df.columns.tolist()
df_final = df.fillna(0)
if ensembl_filter is not None:
to_select = df.columns[df.columns.isin(ensembl_filter)]
filtered_df = df[to_select].copy()
copy = filtered_df.copy()
df_final = copy.fillna(0)
gene_ids = df_final.columns.tolist()
feature_names = [v for e in gene_ids for k, v in id2name.items() if e == k]
return df_final, sample_names, gene_ids, feature_names
[docs]def feature_reduction(X_train, X_test, featureNames):
""" Function to filter out 0-variance genes
:param X_train: numpy.array of training data
:param X_test: numpy.array of testing data
:param featureNames: array with gene feature names
:return: filtered and scaled X_train, numpy array
:return: filtered and scaled X_test, numpy array
:return: filtered feature_names, list
:return: filtered gene_ids, list
"""
scaler = MinMaxScaler()
var_filter = VarianceThreshold(threshold=0)
# get gene_ids for reducing after variance threshold
gene_ids = np.array(X_train.columns.tolist())
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)
X_train_filtered = var_filter.fit_transform(X_train_sc)
X_test_filtered = var_filter.transform(X_test_sc)
# get names of reduced feature_names
mask = var_filter.get_support()
feature_arr = np.array(featureNames)
feature_names_reduced = feature_arr[mask].tolist()
gene_ids_red = gene_ids[mask].tolist()
# print INFO
print("{}: {}".format("Number of training instances ", X_train_filtered.shape[0]))
print("{}: {}".format("Number of test instances ", X_test_filtered.shape[0]))
print("{}: {}".format("Number of features after VarianceFilter ", len(feature_names_reduced)))
print("{}: {}".format(len(featureNames) - len(feature_names_reduced), "features were removed "))
return X_train_filtered, X_test_filtered, feature_names_reduced, gene_ids_red
[docs]def max_features_arr(feature_names, max_f_arr):
""" Function for obtaining max_features as float array for RandomForest
:param feature_names: array of gene feature names
:param max_f_arr: array of max_features to be converted into float array for grid search
:return: float array of max_features
"""
n_features_ = len(feature_names)
float_arr = [i / n_features_ for i in max_f_arr]
return float_arr
[docs]def search_space(model):
"""Function to define the search space for GridSearch or RandomGridSearch
:param model: one of LinearSVC, SVC, RandomForest, MultiLayerPerceptron
:return: search space
"""
space = {}
if model == 'LinearSVC':
# combinations = 12 * 3 = 36
space = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 40, 60, 80, 100],
'max_iter': [10000, 50000, 100000]}
if model == 'SVC':
# combinations = 12 * 6 * 3 * 3 = 648
space = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 40, 60, 80, 100],
'gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
'kernel': ['sigmoid', 'poly', 'rbf'],
'max_iter': [10000, 50000, 100000]}
if model == 'RandomForest':
# combinations = 6 * 2 * 32 * 2 * 4 * 2 = 6144
space = {'n_estimators': [10, 100, 200, 400, 600, 1000],
'max_features': ['auto', 'sqrt'],
'max_depth': np.linspace(1, 32, 32, endpoint=True),
'min_samples_leaf': [3, 5],
'min_samples_split': [6, 8, 10, 12],
'bootstrap': [True, False]}
if model == 'MultiLayerPerceptron':
# combinations = 3 * 4 * 2 * 4 * 3 * 2 + 1 = 577
space = {'hidden_layer_sizes': [(1000, 500), (500, 200), (100, 50)],
'activation': ['tanh', 'relu', 'identity'],
'solver': ['sgd', 'adam'],
'alpha': [0.0001, 0.001, 0.01, 0.1],
'max_iter': [10000, 50000, 100000],
'learning_rate': ['constant', 'adaptive'],
'early_stopping': [True]}
return space
[docs]def grid_search(model, model_dict, X_train, y_train, X_test, y_test, gene_ids, feature_names, splits, n_jobs, refitting,
outpath, dataset_title):
"""Function to perform grid search and print evaluation performance
:param model: LinearSVC, SVC, RandomForest, MultiLayerPerceptron
:param model_dict: dictionary which instantiates models
:param X_train: scaled training data, numpy array
:param y_train: list of target values
:param X_test: scaled test data, numpy array
:param y_test: list of target test values
:param gene_ids: list of gene ids
:param feature_names: list of features
:param splits: number of splits for cross-validation
:param n_jobs: number of cpu-cores to be used
:param refitting: default is True
:param outpath: for plots and tables
:param dataset_title: i.e. tissue_dataset
:return: None
"""
# set display options
pd.set_option("display.max_rows", 20, "display.max_columns", 15)
# construct output path
new_outpath = outpath + model + '_' + dataset_title
# Select classifier from dict
clf = model_dict[model]
# Select grid
grid = search_space(model)
print("Performing random search for: ", clf)
t0 = time()
# Define scoring metrics
scoring = {'mcc': make_scorer(matthews_corrcoef), 'bal_acc': 'balanced_accuracy', 'rec': 'recall',
'prec': 'precision'}
Matthews = make_scorer(matthews_corrcoef, greater_is_better=True)
# Instantiate stratified-shuffled 5x2 cross validation: shuffling important when class labels contiguously in data
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=65)
outer_cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=65)
# Define GridSearch
search = GridSearchCV(clf, grid, cv=inner_cv, return_train_score=True, scoring=Matthews, n_jobs=n_jobs,
refit=refitting) # when refit = True, fitting once to get best_model is sufficient
# Performing non-nested grid search and hyper-parameter tuning
search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best params: ", search.best_params_)
print("Best estimator: ", search.best_estimator_)
best_model = search.best_estimator_
# Cross_val_scores of trees
cv_results = search.cv_results_
cv_df = pd.DataFrame.from_dict(cv_results)
print('Cross-Validation GridSearch ' + model + '\n', cv_df)
# print(cv_results)
print_mean_std_scores(cv_results)
# Outer CV loop (5x2 cross validation) with different scores
nested_score = cross_validate(best_model, X_train, y_train, cv=outer_cv, scoring=scoring)
print("Outer cross validation scores on train data: ")
print_mean_std_scores(nested_score)
if model == 'RandomForest':
# Extract feature importance of forest
forest_feature_importance = best_model.feature_importances_
df_forest_feature_importance = pd.DataFrame(sorted(zip(forest_feature_importance, gene_ids, feature_names),
reverse=True),
columns=['Feature_Importance', 'GeneID', 'GeneName'])
plot_bar(df_forest_feature_importance, new_outpath)
print("20 most important features RandomForest \n", df_forest_feature_importance.head(n=20))
# Save feature_importance to file
df_forest_feature_importance.to_csv(new_outpath + '_feature_importance.csv', index=False)
# Predict on test data
y_pred = best_model.predict(X_test)
# Evaluate predictions
print(model, ' : ')
print(classification_report(y_test, y_pred, target_names=['normal', 'tumor']))
print(model + " MCC on test data %.3f" % (matthews_corrcoef(y_test, y_pred)))
print("Balanced Accuracy on test data %.3f" % (balanced_accuracy_score(y_test, y_pred)))
# Evaluate model
fig = plot_confusion_matrix(best_model, X_test, y_test, display_labels=('control', 'tumor'), cmap=plt.cm.Blues)
fig.ax_.set_title('Confusion Matrix - ' + model)
plt.savefig(new_outpath + '_confusion_matrix.png')
plot_roc_curve(best_model, X_test, y_test, name=model)
plt.savefig(new_outpath + '_roc_auc.png')
[docs]def print_mean_std_scores(scoring_dict):
"""Print cross_validation_results
:param scoring_dict: from grid_search, cross_val_score, cross_validate
:return: None
"""
for k, v in scoring_dict.items():
# parameters cannot be used for computing mean and std
if k.startswith('param'):
continue
else:
print("%s: mean=%.3f, (+/- std=%.3f)" % (k, v.mean(), v.std()))
[docs]def plot_bar(df, outpath):
"""Plot 10 most important features
:param df: pandas Dataframe of feature importances
:return: None
"""
colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
# set the style of the axes and the text color
plt.rcParams['axes.edgecolor'] = '#333F4B'
plt.rcParams['axes.linewidth'] = 0.8
plt.rcParams['xtick.color'] = '#333F4B'
plt.rcParams['ytick.color'] = '#333F4B'
plt.rcParams['text.color'] = '#333F4B'
names = df.iloc[:, 2].tolist()[:10]
percentages = df.iloc[:, 0].tolist()[:10]
fig, ax = plt.subplots(figsize=(7, 5))
plt.bar(names, percentages, width=0.5, color=colors['gray'], edgecolor=colors['black'])
plt.xticks(rotation=45, ha='right')
ax.set_ylabel('Feature Importance', fontsize=12)
ax.set_xlabel('Genes', fontsize=12)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig(outpath + '_feature_importance_plot.png')
if __name__ == "__main__":
main()