Source code for fileUtils.metadata_handling

from collections import OrderedDict
import pandas as pd


def read_metadata(metadata, sample_ids):
    """Read a metadata CSV and derive condition labels and an annotation table.

    The first line of the file is treated as a header and skipped. For every
    following non-blank line the first four comma-separated fields are read,
    in this order, as FileID, CaseID, SampleType, Project (surrounding
    whitespace is stripped from each field). If a FileID occurs more than
    once, only its first row is kept.

    :param metadata: path of the metadata CSV file
    :param sample_ids: iterable of file ids to select; repeated ids are
        collapsed and the order of first occurrence is kept
    :return: tuple ``(meta_dict, target_names, target, annotations)``

        * meta_dict: dict mapping every FileID found in the file to
          ``[CaseID, SampleType, Project]``
        * target_names: dict mapping each unique id of ``sample_ids`` to its
          SampleType string
        * target: list with one entry per unique sample id, ``0`` where the
          SampleType equals ``'normal'`` and ``1`` otherwise
        * annotations: pd.DataFrame with columns ID, CaseID, Condition,
          Project and one row per unique sample id (empty, but with these
          columns, when ``sample_ids`` is empty)
    :raises KeyError: if an id in ``sample_ids`` is not present in the file
    :raises IndexError: if a non-blank data line has fewer than four fields
    """
    columns = ['ID', 'CaseID', 'Condition', 'Project']

    # Read metadata into dictionary
    meta_dict = {}
    with open(metadata, 'r') as file:
        next(file, None)  # skip the header line (no-op on an empty file)
        for line in file:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would otherwise fail the field lookups.
            if not line.strip():
                continue
            fields = [field.strip() for field in line.split(",")]
            file_id = fields[0]
            record = [fields[1], fields[2], fields[3]]
            # First occurrence of a FileID wins; later duplicates are ignored
            if file_id not in meta_dict:
                meta_dict[file_id] = record

    # Map the unique sample ids (in order of first appearance) to conditions
    target_names = {}
    for sample_id in sample_ids:
        if sample_id not in target_names:
            target_names[sample_id] = meta_dict[sample_id][1]

    # Conditions encoded as numbers: 'normal' -> 0, anything else -> 1
    target = [0 if condition == 'normal' else 1
              for condition in target_names.values()]

    # Annotation table, one row per unique sample id, in the same order as
    # target_names. Passing the column names explicitly keeps the frame
    # well-formed even when no sample ids were requested.
    rows = [[sample_id] + meta_dict[sample_id] for sample_id in target_names]
    annotations = pd.DataFrame(rows, columns=columns)

    return meta_dict, target_names, target, annotations