Source code for machine_learning_tool

""" This script can train different machine learning models performs grid search and evaluation of models

"""


# Bash wrapper script to run machine_learning_tool.py in ml_docker_container
#
#     declare -a classifiers=("LinearSVC" "SVC" "RandomForest" "MultiLayerPerceptron" )
#
#     for val in ${classifiers[@]}; do \
#         echo Starting to train $val classifier \
#         python app/machine_learning_tool.py \
#         -i  <INPATH.TXT> \
#         -m <METADATA.CSV> \
#         -a $val \
#         -o <DEST-DIR>  \
#         -t <TITLE> \
#     done

# imports logging
import click
import logging
from time import time
import os

# imports data handling
import pandas as pd
import numpy as np
import fileUtils.metadata_handling as mh

# imports scikit-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold

# scores
from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import classification_report, make_scorer, matthews_corrcoef

# classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

# feature selection
from sklearn.feature_selection import VarianceThreshold

# plotting
import umap
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors

# Create logger
logger = logging.getLogger('Machine Learning Tool')
# Create console handler
ch = logging.StreamHandler()
# Create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
logger.setLevel(logging.INFO)


@click.command()
@click.option('-i', '--inpath', prompt='path to gene_count table',
              help='path to gene_count table', required=True)
@click.option('-m', '--metadata', prompt='path to metadata table',
              help='Table with metadata in csv format', required=True)
@click.option('-a', '--algorithm', prompt='please enter a machine learning algorithm',
              help='please enter a machine learning algorithm', required=True,
              type=click.Choice(['LinearSVC', 'SVC', 'RandomForest', 'MultiLayerPerceptron'], case_sensitive=True))
@click.option('-k', '--kegg', help='provide path to file with KEGG genes of interest',
              required=False, default=None)
@click.option('-o', '--outpath', prompt='path for outputs',
              help='Path for output tables and plots. Extension is filled automatically.')
@click.option('-c', '--cores', help='please enter number of cpu cores to be used for search', default=2)
@click.option('-t', '--title', help='please enter a dataset title/tissue title')
def main(inpath, metadata, algorithm, kegg, outpath, title, cores):
    start_time = time()

    # Make sure output directory exists
    new_outpath = outpath + 'ml_results/'
    if not os.path.exists(new_outpath):
        os.mkdir(new_outpath)

    logger.info('Parse gene_expression table...')
    if kegg:
        logger.info('Applying KEGG filter...')
        kegg_genes_pathways = pd.read_csv(kegg, sep="\t")
        kegg_ensembl = kegg_genes_pathways.iloc[:, 0].tolist()
        kegg_name = kegg_genes_pathways.iloc[:, 1].tolist()
        id2name_dict = dict(zip(kegg_ensembl, kegg_name))
        expr_df, sample_names, gene_ids, feature_names = read_dataset(inpath, ensembl_filter=kegg_ensembl,
                                                                      id2name=id2name_dict)
    # No filtering for KEGG genes
    else:
        expr_df, sample_names, gene_ids, feature_names = read_dataset(inpath, ensembl_filter=None,
                                                                      id2name=None)

    logger.info('Parse metadata...')
    meta_dict, target_names, target, annotations = mh.read_metadata(metadata, sample_names)

    logger.info('Split training and testing data...')
    X_train, X_test, y_train, y_test = train_test_split(expr_df, target, stratify=target, test_size=0.2,
                                                        random_state=99)
    logger.info('Feature scaling and filtering out 0-variance genes...')
    X_train_f, X_test_f, f_names_red, gene_ids_red = feature_reduction(X_train, X_test, feature_names)

    # Define models
    # pass randomState Instance to RandomForest upon instantiation (following best practices sklearn)
    models = {'LinearSVC': LinearSVC(random_state=37), 'SVC': SVC(random_state=42),
              'RandomForest': RandomForestClassifier(random_state=np.random.RandomState(0)),
              'MultiLayerPerceptron': MLPClassifier(random_state=87)}

    logger.info('Performing grid search...')
    grid_search(model=algorithm, model_dict=models, X_train=X_train_f, y_train=y_train, X_test=X_test_f, y_test=y_test,
                gene_ids=gene_ids_red, feature_names=f_names_red, splits=5, n_jobs=cores, refitting=True,
                outpath=new_outpath, dataset_title=title)

    end_time = time()
    logger.info('Process finished in ' + str(round(end_time - start_time, 2)) + "sec")


[docs]def read_dataset(path, ensembl_filter, id2name): """ Function to parse a dataset :param path: to input gene-expression table (GeneID, GeneName, Sample1, Sample2, ...) :param ensembl_filter: input list of ensembl_gene_ids :param names: input list of gene_names :return: pandas.Dataframe :return: sample_names, list :return: gene_ids = features, list :return: gene_names = feature_names, list """ data = pd.read_csv(path, sep="\t") df = data.T feature_names = df.iloc[1].tolist() df = df.drop(["GeneName"], axis=0) df.columns = df.iloc[0] df = df.drop(["GeneID"], axis=0) df.columns.name = None sample_names = df.index.tolist() gene_ids = df.columns.tolist() df_final = df.fillna(0) if ensembl_filter is not None: to_select = df.columns[df.columns.isin(ensembl_filter)] filtered_df = df[to_select].copy() copy = filtered_df.copy() df_final = copy.fillna(0) gene_ids = df_final.columns.tolist() feature_names = [v for e in gene_ids for k, v in id2name.items() if e == k] return df_final, sample_names, gene_ids, feature_names
[docs]def feature_reduction(X_train, X_test, featureNames): """ Function to filter out 0-variance genes :param X_train: numpy.array of training data :param X_test: numpy.array of testing data :param featureNames: array with gene feature names :return: filtered and scaled X_train, numpy array :return: filtered and scaled X_test, numpy array :return: filtered feature_names, list :return: filtered gene_ids, list """ scaler = MinMaxScaler() var_filter = VarianceThreshold(threshold=0) # get gene_ids for reducing after variance threshold gene_ids = np.array(X_train.columns.tolist()) X_train_sc = scaler.fit_transform(X_train) X_test_sc = scaler.transform(X_test) X_train_filtered = var_filter.fit_transform(X_train_sc) X_test_filtered = var_filter.transform(X_test_sc) # get names of reduced feature_names mask = var_filter.get_support() feature_arr = np.array(featureNames) feature_names_reduced = feature_arr[mask].tolist() gene_ids_red = gene_ids[mask].tolist() # print INFO print("{}: {}".format("Number of training instances ", X_train_filtered.shape[0])) print("{}: {}".format("Number of test instances ", X_test_filtered.shape[0])) print("{}: {}".format("Number of features after VarianceFilter ", len(feature_names_reduced))) print("{}: {}".format(len(featureNames) - len(feature_names_reduced), "features were removed ")) return X_train_filtered, X_test_filtered, feature_names_reduced, gene_ids_red
[docs]def max_features_arr(feature_names, max_f_arr): """ Function for obtaining max_features as float array for RandomForest :param feature_names: array of gene feature names :param max_f_arr: array of max_features to be converted into float array for grid search :return: float array of max_features """ n_features_ = len(feature_names) float_arr = [i / n_features_ for i in max_f_arr] return float_arr
[docs]def search_space(model): """Function to define the search space for GridSearch or RandomGridSearch :param model: one of LinearSVC, SVC, RandomForest, MultiLayerPerceptron :return: search space """ space = {} if model == 'LinearSVC': # combinations = 12 * 3 = 36 space = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 40, 60, 80, 100], 'max_iter': [10000, 50000, 100000]} if model == 'SVC': # combinations = 12 * 6 * 3 * 3 = 648 space = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 40, 60, 80, 100], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0], 'kernel': ['sigmoid', 'poly', 'rbf'], 'max_iter': [10000, 50000, 100000]} if model == 'RandomForest': # combinations = 6 * 2 * 32 * 2 * 4 * 2 = 6144 space = {'n_estimators': [10, 100, 200, 400, 600, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': np.linspace(1, 32, 32, endpoint=True), 'min_samples_leaf': [3, 5], 'min_samples_split': [6, 8, 10, 12], 'bootstrap': [True, False]} if model == 'MultiLayerPerceptron': # combinations = 3 * 4 * 2 * 4 * 3 * 2 + 1 = 577 space = {'hidden_layer_sizes': [(1000, 500), (500, 200), (100, 50)], 'activation': ['tanh', 'relu', 'identity'], 'solver': ['sgd', 'adam'], 'alpha': [0.0001, 0.001, 0.01, 0.1], 'max_iter': [10000, 50000, 100000], 'learning_rate': ['constant', 'adaptive'], 'early_stopping': [True]} return space
[docs]def plot_bar(df, outpath): """Plot 10 most important features :param df: pandas Dataframe of feature importances :return: None """ colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS) # set the style of the axes and the text color plt.rcParams['axes.edgecolor'] = '#333F4B' plt.rcParams['axes.linewidth'] = 0.8 plt.rcParams['xtick.color'] = '#333F4B' plt.rcParams['ytick.color'] = '#333F4B' plt.rcParams['text.color'] = '#333F4B' names = df.iloc[:, 2].tolist()[:10] percentages = df.iloc[:, 0].tolist()[:10] fig, ax = plt.subplots(figsize=(7, 5)) plt.bar(names, percentages, width=0.5, color=colors['gray'], edgecolor=colors['black']) plt.xticks(rotation=45, ha='right') ax.set_ylabel('Feature Importance', fontsize=12) ax.set_xlabel('Genes', fontsize=12) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) plt.tight_layout() plt.savefig(outpath + '_feature_importance_plot.png')
if __name__ == "__main__": main()