Source code for fileUtils.metadata_handling
from collections import OrderedDict
import pandas as pd
def read_metadata(metadata, sample_ids):
    """Read a CSV metadata file and build per-sample annotations.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is split on commas and read positionally as
    ``FileID, CaseID, SampleType, Project``; additional fields are ignored.
    When a FileID occurs more than once, the first occurrence is kept.

    :param metadata: path of the metadata CSV file
    :param sample_ids: iterable of FileIDs to annotate; duplicates are
        collapsed, keeping the order in which they are first seen
    :return: tuple ``(meta_dict, target_names, target, annotations)``

        * ``meta_dict``: dict mapping every FileID in the file to
          ``[CaseID, SampleType, Project]``
        * ``target_names``: dict mapping each requested sample id to its
          SampleType
        * ``target``: list with ``0`` where the SampleType is exactly
          ``'normal'`` and ``1`` otherwise, in ``target_names`` order
        * ``annotations``: ``pd.DataFrame`` with columns ``ID``, ``CaseID``,
          ``Condition`` and ``Project``, one row per requested sample id
    :raises IndexError: if a non-blank data line has fewer than four fields
    :raises KeyError: if a requested sample id is not in the metadata file
    """
    # Read metadata into dictionary
    meta_dict = {}
    with open(metadata, 'r') as handle:
        next(handle, None)  # skip the header line (no-op on an empty file)
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. trailing newlines) carry no record.
                continue
            # Plain comma split is kept on purpose so parsing of existing
            # files does not change (no quoting support).
            fields = line.split(",")
            file_id = fields[0].strip()
            record = [fields[1].strip(), fields[2].strip(), fields[3].strip()]
            # First occurrence of a FileID wins.
            meta_dict.setdefault(file_id, record)

    # Map sample_ids to conditions (SampleType), dropping repeated ids
    target_names = {}
    for sample_id in sample_ids:
        if sample_id not in target_names:
            target_names[sample_id] = meta_dict[sample_id][1]

    # Conditions encoded as numbers: 0 for 'normal', 1 for anything else
    target = [0 if condition == 'normal' else 1
              for condition in target_names.values()]

    # Build the annotation table directly; passing the column names
    # explicitly keeps the frame well-formed when no sample ids are given.
    rows = [[sample_id] + meta_dict[sample_id] for sample_id in target_names]
    annotations = pd.DataFrame(
        rows, columns=['ID', 'CaseID', 'Condition', 'Project'])

    return meta_dict, target_names, target, annotations