# Source code for icgc_metadata_processing

# This script processes ICGC clinical metadata read from the JSON files of the FILE and DONOR endpoints

# imports
import click
import os.path
import sys
import logging
import time
import json
from dictor import dictor
import pandas as pd
import fileUtils.file_handling as fh

# Module-level logger named 'Json parser'; messages of level INFO and above are emitted
logger = logging.getLogger('Json parser')
logger.setLevel(logging.INFO)
# Formatter: timestamp - logger name - level - message
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Console handler that renders records with the formatter above
ch = logging.StreamHandler()
ch.setFormatter(formatter)
# Attach the console handler to the logger
logger.addHandler(ch)


# Command line entry point: collect the json files of both ICGC endpoints,
# merge their metadata and write the result as a csv table.
# NOTE: documented with comments on purpose -- click shows a command's
# docstring as its --help text, so adding one would change the CLI output.
@click.command()
@click.option(
    '-f', '--file_endpt',
    prompt='path to folder with ICGC FILE metadata',
    help='Path to folder with ICGC FILE metadata',
    required=True,
)
@click.option(
    '-d', '--donor_endpt',
    prompt='path to folder with ICGC DONOR metadata',
    help='Path to folder with ICGC DONOR metadata',
    required=True,
)
@click.option(
    '-o', '--outpath',
    prompt='output table',
    help='Table with metadata in csv format',
    required=True,
)
def main(file_endpt, donor_endpt, outpath):
    # wall-clock start, used for the runtime reported at the end
    began = time.time()

    # progress messages go through the module logger (console handler)
    logger.info('Get json files FILE endpoint')
    file_jsons = fh.get_files(file_endpt, '*.json')

    logger.info('Get json files DONOR endpoint')
    donor_jsons = fh.get_files(donor_endpt, '*.json')

    logger.info('Parse json files')
    merged_metadata = parse_icgc_json_files(file_jsons, donor_jsons)

    logger.info('Write metadata to table')
    write_icgc_table(outpath, merged_metadata)

    # elapsed seconds, rounded to two decimals
    elapsed = round(time.time() - began, 2)
    logger.info(f'Process finished in {elapsed}sec')


def parse_icgc_json_files(files, donors):
    """Read json files from file & donor endpoint and extract relevant values

    Each FILE endpoint json creates one entry keyed by its file ID, i.e. the
    part of the file's basename before the first ".". Each DONOR endpoint json
    whose basename starts with the same ID then extends that entry with the
    donor's clinical values. DONOR jsons without a matching entry are skipped
    without being opened.

    :param files: list of paths to json_files, file endpoint
    :param donors: list of paths to json_files, donor endpoint
    :return: metadata dictionary mapping file ID to the list
        [file_name, donor_id, project, sample_id, specimen_id, specimen_type],
        extended by [primary_site, primary_diagnosis, tumor_subtype, gender,
        vital_status, age_at_index, survival_time, tumor_stage, icd10] for
        every matching DONOR json
    :raises IndexError: if the basename of a FILE json contains no "."
    """
    # Keys read from every DONOR json, in the order their values are appended.
    # The order has to match the column order used when the table is written.
    donor_fields = (
        "primarySite",
        "tumourType",
        "tumourSubtype",
        "gender",
        "vitalStatus",
        "ageAtDiagnosis",
        "survivalTime",
        "tumourStageAtDiagnosis",
        "diagnosisIcd10",
    )
    metadata = {}

    # Parse FILE endpoint info store in metadata dict
    for path in files:
        with open(path, encoding="utf-8") as json_file:
            data = json.load(json_file)

        # basename is "<file_id>.<file_name>...."
        name_parts = os.path.basename(path).split(".")
        file_id, file_name = name_parts[0], name_parts[1]

        # dictor returns None for keys that are missing in the json
        donor_id = dictor(data, "donors.0.donorId")
        specimen_type = dictor(data, "donors.0.specimenType.0")
        project = dictor(data, "donors.0.projectCode")
        sample_id = dictor(data, "donors.0.sampleId.0")
        specimen_id = dictor(data, "donors.0.specimenId.0")
        print(file_id, file_name, donor_id, specimen_type, specimen_id, sample_id)

        # the first json seen for a file ID wins, later duplicates are ignored
        if file_id not in metadata:
            # PACA_CA needs , specimen_type
            metadata[file_id] = [file_name, donor_id, project, sample_id,
                                 specimen_id, specimen_type]

    # Parse DONOR endpoint info and append to metadata dict
    for path in donors:
        file_id = os.path.basename(path).split(".")[0]
        row = metadata.get(file_id)
        if row is None:
            # no FILE entry to attach this donor information to
            continue

        # open the path as given (not only its basename), so the lookup does
        # not depend on the current working directory
        with open(path, encoding="utf-8") as json_file:
            data = json.load(json_file)

        row.extend(dictor(data, field) for field in donor_fields)

    return metadata
def write_icgc_table(outpath, metadata):
    """Write metadata to table

    The dictionary keys become the first column ('File_ID'); the list stored
    under each key fills the remaining columns in order.

    :param outpath: to store output table .csv
    :param metadata: dictionary
    :return: metadata table
    """
    # PACA_CA needs 'Specimen_Type'
    column_names = [
        'File_name', 'Donor_ID', 'Project', 'Sample_ID', 'Specimen_ID',
        'Specimen_type', 'Primary_site', 'Primary_diagnosis', 'Tumor_subtype',
        'Gender', 'Vital_status', 'Age_at_index', 'Survival_time',
        'Tumor_stage', 'Icd10',
    ]
    indexed = pd.DataFrame.from_dict(metadata, orient='index', columns=column_names)
    # move the dictionary keys out of the index into a regular 'File_ID' column
    table = indexed.reset_index(level=0).rename(columns={'index': 'File_ID'})
    table.to_csv(outpath, index=False)
    return table
# Run the command line interface only when this file is executed as a script;
# the value returned by main() is passed on as the process exit status.
if __name__ == "__main__":
    sys.exit(main())