Source code for merge_replica

# Script to merge stringTieFPKM TPM or featureCounts raw count values
#    outputs:    - merged_replica_table
#                - merged_gene_table
#                - new_metadata with new ids
#                - new_metadata with batch information encoded as INT


# imports
import click
import logging
import time
import pandas as pd

# Create logger
logger = logging.getLogger('Replica merger')
# Create console handler
ch = logging.StreamHandler()
# Create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
logger.setLevel(logging.INFO)


@click.command()
@click.option('-i', '--inpath', prompt='input path table with replica', 
                help='input path to merged TPM/featureCounts table with replica',
                required=True)
@click.option('-m', '--metadata', prompt='path to metadata files',
                help='Path to table in csv format',
                required=True)
@click.option('-o', '--outpath', prompt='output table',
                help='Output table with median merged replica (TXT). Extension filled automatically',
                required=True)
def main(inpath, metadata, outpath):
    start_time = time.time()

    logger.info('Reading metadata...')
    meta_table, meta_dict = read_metadata(metadata)

    logger.info('Writing output to table...')
    merge_replica(inpath, meta_table, outpath)

    end_time = time.time()
    logger.info('Process finished in ' + str(round(end_time - start_time, 2)) + "sec")


[docs]def read_metadata(meta): """ Function to read metadata table and connect case_id to file_id Format: FileID, CaseID, Sample_Type, Project :param: meta: inpath to metadata table :return: metadata: as dataFrame :return: dictionary: mapping case_id to sample_ids """ metadata = pd.read_csv(meta, sep=",") file_id = metadata.iloc[:, 0] case_id = metadata.iloc[:, 2] id_dict = dict(zip(file_id, case_id)) # dict mapping {BioSample: (SRR1, SRR2), ...} new_dict = {} for k, v in id_dict.items(): if v not in new_dict.keys(): new_dict[v] = [k] else: new_dict[v].append(k) return metadata, new_dict
[docs]def merge_replica(inpath, meta_table, outpath): """ Function to merge replica with same BioSample = Case_ID and condition :param: inpath: path to TPM/featureCounts merged table :param: meta_table: metadata table, dataFrame :param: outpath: path where to store merged_file, csv :return: merged_replica_file.csv """ pd.set_option("display.max_rows", 20, "display.max_columns", 10) # read tpm values df = pd.read_csv(inpath, sep="\t") gene_name = list(df.iloc[:, 1]) # print(gene_name) # gene_ids = df.index df = df.drop(['GeneName'], axis=1) # transpose df dft = df.T # = sorter sample_ids = dft.index[1:].tolist() new_header = dft.iloc[0] dft = dft[1:] dft.columns = new_header # sort metadata according to sample_ids meta_table.FileID = meta_table.FileID.astype("category") meta_table.FileID.cat.set_categories(sample_ids, inplace=True) meta_table = meta_table.sort_values('FileID') # attach info to merge replica on file_id = list(meta_table.iloc[:, 0]) case_id = list(meta_table.iloc[:, 1]) sample_type = list(meta_table.iloc[:, 2]) project = list(meta_table.iloc[:, 3]) p_dict = dict(zip(case_id, project)) mapping = zip(sample_ids, zip(file_id, case_id)) dft.insert(0, 'CaseID', case_id) dft.insert(1, 'SampleType', sample_type) dft.reset_index(drop=True, inplace=True) dft = dft.rename_axis(None, axis=1) # remove index name dft.insert(0, 'FileID', file_id) # make Multi-Index for merging replica based on case_id and sample_type dft.set_index(['FileID', 'CaseID', 'SampleType'], inplace=True) # convert object type numbers to numeric dft = dft.apply(pd.to_numeric) # merge replica based sample_id and condition df_final = dft.groupby(['CaseID', 'SampleType'], axis=0).median().reset_index() # attach condition info as numbers condition_dict = {'normal': 0, 'tumor': 1} df_final.insert(2, 'Condition', df_final['SampleType'].map(condition_dict)) # attach project info df_final.insert(3, 'Project', df_final['CaseID'].map(p_dict)) # make new unique ids unique_ids = ['id'+str(i) for i in range(len(df_final))] df_final.insert(4, 'UniqueID', unique_ids) df_final.fillna(0, inplace=True) outpath_suffix = outpath + '.txt' df_final.to_csv(outpath_suffix, sep="\t", index=False) # make dataframe look like before for clustering + batch correction df_new = df_final.drop(['CaseID', 'SampleType', 'Condition', 'Project'], axis=1) dft_new = df_new.T dft_new.drop('UniqueID', inplace=True) dft_new.index.set_names('GeneID', inplace=True) dft_new.columns = unique_ids dft_new.insert(0, 'GeneName', gene_name) dft_new.to_csv(outpath+"_gene_table.txt", sep="\t") # create new metadata table with new ids new_case_ids = list(df_final.CaseID) new_sample_types = list(df_final.SampleType) new_project = list(df_final.Project) metadata_new = pd.DataFrame({'FileID': unique_ids, 'CaseID': new_case_ids, 'SampleType': new_sample_types, 'Project': new_project}) metadata_new.to_csv(outpath + "_metadata_new.csv", index=False) # annotate metadata with Batches project_series = pd.Series(metadata_new.Project) projects_set = set(metadata_new.Project) batches = [i for i in range(1, len(projects_set) + 1)] project_batch_dict = dict(zip(projects_set, batches)) metadata_new['Batch'] = project_series.map(project_batch_dict) metadata_new.to_csv(outpath + "_metadata_new_batches.csv", sep=';', index=False) # print info print("number of samples gene_count_table ", len(dft_new.columns[2:])) print("number of samples in metadata table ", len(metadata_new)) print("No. Normal samples: ", len(metadata_new[metadata_new.SampleType == 'normal'])) print("No. Tumor samples: ", len(metadata_new[metadata_new.SampleType == 'tumor']))
if __name__ == "__main__": main()