# Source code for metadata_processing

""" This script processes metadata from TCGA, ICGC and SRA and outputs a standardized table
    Metadata Table [sample_id, case_id, condition, bio_project]

    usage (-i and -t may be repeated to pass several folders):
    python metadata_processing.py -i <path-to-fileEndpt-json> ... -s <path-to-csv-files>
    -t <path-to-json> ... -o <outpath.csv>
"""

# imports
import os
import click
import os.path
import sys
import re
import json
from dictor import dictor
import pandas as pd
import fileUtils.file_handling as fh


def _metadata_to_frame(metadata):
    """Turn a ``{file_id: [case_id, sample_type, project]}`` dict into a DataFrame.

    Every source (ICGC, SRA, TCGA) goes through this one helper so that all
    partial tables share the exact same column names and can be concatenated
    row-wise without creating extra, half-empty columns.

    :param metadata: dict mapping a file identifier to a 3-item list
    :return: DataFrame with columns [FileID, CaseID, SampleType, Project]
    """
    frame = pd.DataFrame.from_dict(metadata, orient='index',
                                   columns=['CaseID', 'SampleType', 'Project'])
    # The dict keys end up in the index; expose them as a regular column.
    frame = frame.reset_index(level=0)
    return frame.rename({'index': 'FileID'}, axis=1)


@click.command()
@click.option('-i', '--icgc', prompt='paths to ICGC json metadata files FILE Endpoint',
              help='Path to folder with ICGC metadata in json format', multiple=True)
@click.option('-s', '--sra', prompt='paths to SRA csv metadata files',
              help='Path to folder with SRA metadata in csv format')
@click.option('-t', '--tcga', prompt='paths to TCGA json metadata files',
              help='Path to folder with TCGA metadata in json format', multiple=True)
@click.option('-o', '--outpath', prompt='output path for storing metadata table',
              help='Table with metadata in csv format [Sample_ID, Case_ID, Condition, Project]',
              required=True)
def main(icgc, sra, tcga, outpath):
    """Collect ICGC, SRA and TCGA metadata and write one combined csv table."""
    # icgc / tcga: tuples of folder paths (options declared with multiple=True)
    # sra:         a single folder path
    # outpath:     destination of the combined csv file
    dfs = []

    # ICGC (several folders can be given)
    for folder in icgc:
        icgc_files = fh.get_files(folder, '*.json')
        dfs.append(_metadata_to_frame(parse_icgc_json_files(icgc_files)))

    # SRA (assumed to be in one folder)
    # NOTE(review): fh.parse_csv is assumed to return the same
    # {id: [case_id, sample_type, project]} mapping as the json parsers - confirm.
    sra_files = fh.get_files(sra, '*.csv')
    # Previously this frame named its id column 'File_ID' while the others used
    # 'FileID', so the concatenated table carried two partially filled id columns.
    dfs.append(_metadata_to_frame(fh.parse_csv(sra_files)))

    # TCGA (several folders can be given)
    for folder in tcga:
        tcga_files = fh.get_files(folder, '*.json')
        dfs.append(_metadata_to_frame(parse_tcga_json_files(tcga_files)))

    # Stack all partial tables into one and write it without the row index.
    frame = pd.concat(dfs, axis=0, ignore_index=True)
    frame.to_csv(outpath, index=False)


def parse_tcga_json_files(files):
    """Read json files from list and extract relevant values

    Each file contributes one entry keyed by its ``data.submitter_id``; when
    several files share a submitter id, the first one read wins.

    :param files: paths of the TCGA json metadata files, as list
    :return: tcga_metadata: dict mapping submitter_id to
        [case_id, sample_type, project], where sample_type is 'normal',
        'tumor', or None when the file carries no usable sample type
    """
    tcga_metadata = {}
    normal_pattern = re.compile('normal', re.IGNORECASE)

    for path in files:
        # Open the path exactly as given. Reducing it to its basename first
        # (as was done before) only works when the current working directory
        # happens to be the folder that holds the metadata files.
        with open(path, encoding='utf-8') as json_file:
            data = json.load(json_file)

        case_id = dictor(data, "data.cases.0.case_id")
        submitter_id = dictor(data, "data.submitter_id")  # = file_name
        sample_type = dictor(data, "data.cases.0.samples.0.sample_type")
        project = dictor(data, "data.cases.0.project.project_id")

        # Collapse the free-text sample type into normal / tumor. A missing or
        # non-string value is kept as None (unknown) instead of raising a
        # TypeError inside the regex search or being mislabelled as tumor.
        if isinstance(sample_type, str):
            sample_type = 'normal' if normal_pattern.search(sample_type) else 'tumor'
        else:
            sample_type = None

        # First occurrence of a submitter id wins; later duplicates are ignored.
        tcga_metadata.setdefault(submitter_id, [case_id, sample_type, project])

    return tcga_metadata
def parse_icgc_json_files(files):
    """Read json files from list and extract relevant values

    Each file contributes one entry keyed by the second dot-separated part of
    its file name; when several files share that key, the first one read wins.

    :param files: paths of the ICGC json metadata files, as list
    :return: icgc_metadata: dict mapping file_name to
        [donor_id, specimen_type, project], where specimen_type is 'normal',
        'tumor', or None when the file carries no usable specimen type
    """
    icgc_metadata = {}
    normal_pattern = re.compile('normal', re.IGNORECASE)

    for path in files:
        # Progress output: one line per processed file.
        print(path)

        # Open the path exactly as given. Reducing it to its basename first
        # (as was done before) only works when the current working directory
        # happens to be the folder that holds the metadata files.
        with open(path, encoding='utf-8') as json_file:
            data = json.load(json_file)

        # presumably files are named <file_id>.<file_name>.json - verify
        # against the download step; only the second component is used as key.
        file_name = os.path.basename(path).split(".")[1]
        donor_id = dictor(data, "donors.0.donorId")
        specimen_type = dictor(data, "donors.0.specimenType.0")
        project = dictor(data, "donors.0.projectCode")

        # Collapse the free-text specimen type into normal / tumor. A missing
        # or non-string value is kept as None (unknown) instead of raising a
        # TypeError inside the regex search or being mislabelled as tumor.
        if isinstance(specimen_type, str):
            specimen_type = 'normal' if normal_pattern.search(specimen_type) else 'tumor'
        else:
            specimen_type = None

        # First occurrence of a file name wins; later duplicates are ignored.
        icgc_metadata.setdefault(file_name, [donor_id, specimen_type, project])

    return icgc_metadata
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()