# Source code for xml_soup

# Script to parse metadata from NCBI SRA for Liver tissue

import os
import click
import logging
from bs4 import BeautifulSoup
import fileUtils.file_handling as fh
import pandas as pd
import re
import time

# Module-wide logging setup: every record of level INFO or above is written
# to a console stream handler, prefixed with timestamp, logger name and level.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Console handler that renders records with the formatter above
ch = logging.StreamHandler()
ch.setFormatter(formatter)

# Named logger used by the functions in this script
logger = logging.getLogger('SRA metadata reader')
logger.setLevel(logging.INFO)
logger.addHandler(ch)


@click.command()
@click.option('-x', '--xml', prompt='path to xml metadata files',
              help='Path to folder with metadata in xml format',
              required=True)
@click.option('-o', '--outpath', prompt='path for output table ',
              help='Table with rich metadata and short metadata table in csv format.\n'
                   'Extension is filled automatically.',
              required=True)
def main(xml, outpath):
    """Parse NCBI SRA metadata xml files from a folder into csv tables."""
    # xml:     folder handed to fh.get_files together with the '*.xml' pattern
    # outpath: output path prefix handed unchanged to parse_soup, which
    #          appends the file extensions itself
    # (kept as comments because click shows the docstring as --help text)

    # perf_counter is monotonic, so the reported duration cannot be distorted
    # by adjustments of the system clock while the script runs.
    started = time.perf_counter()

    logger.info('Get files')
    all_xml_files = fh.get_files(xml, '*.xml')

    logger.info('Parsing xml files...')
    parse_soup(all_xml_files, outpath)

    elapsed = round(time.perf_counter() - started, 2)
    logger.info('Process finished in %ssec', elapsed)


# Columns of the rich metadata table, in output order. 'Condition' is not
# listed here because it is derived after all files have been parsed.
_METADATA_COLUMNS = [
    'Experiment_title', 'Study_ref', 'Library_name', 'Library_strategy',
    'Library_selection', 'BioSample', 'Subject_id', 'Sample_title',
    'Taxon_id', 'Taxon_name', 'Age', 'Gender', 'Disease', 'Disease_State',
    'Body_site', 'Source_name', 'Histological_Type', 'Tissue_type',
    'Is_tumor', 'Subject_status', 'HBV-infection', 'Cirrhosis',
    'Tumor_stage', 'Tumor_grade', 'Cell_type', 'Primary_tumor_cirrhosis',
    'Sample_type', 'Clinical_characteristics', 'Phenotype', 'Tumor_type',
    'Tumor_status', 'Cancer_type', 'Diagnosis', 'Health_state', 'Label',
    'Description', 'Rna_source',
]

# Metadata column -> sample attribute TAG spellings whose VALUE it stores.
_COLUMN_TAGS = {
    'Subject_id': ('gap_subject_id',),
    'Age': ('age', 'DONOR_AGE', 'patient age at resection'),
    'Gender': ('gender', 'sex', 'Sex', 'patient sex', 'DONOR_SEX'),
    'Disease': ('study disease', 'disease'),
    'Body_site': ('body site', 'body_site', 'organism_part', 'OrganismPart',
                  'source_name'),
    'Histological_Type': ('histological type',),
    'Tissue_type': ('tissue type', 'tissue type/source', 'tissue',
                    'tissue source', 'tissue_type', 'TISSUE_TYPE'),
    'Is_tumor': ('is tumor',),
    'Disease_State': ('disease state', 'disease status', 'DISEASE',
                      'infection status'),
    'Subject_status': ('subject status', 'status'),
    'HBV-infection': ('hbv infection',),
    'Cirrhosis': ('cirrhosis',),
    'Tumor_stage': ('tumor stage', 'staging', 'disease_stage', 'stage',
                    'disease stage'),
    'Tumor_grade': ('total grade', 'tumor grade', 'primary tumor grade'),
    'Cell_type': ('cell type',),
    'Primary_tumor_cirrhosis': ('primary tumor cirrhosis',),
    'Sample_type': ('sample_type', 'BIOMATERIAL_TYPE'),
    'Clinical_characteristics': ('clinical characteristics',),
    'Phenotype': ('phenotype', 'Phenotype'),
    'Tumor_type': ('tumor type',),
    'Tumor_status': ('tumor status',),
    'Cancer_type': ('cancer type',),
    'Diagnosis': ('diagnosis',),
    'Health_state': ('DONOR_HEALTH_STATUS', 'health_state', 'health state'),
    'Label': ('label',),
    'Description': ('Description',),
    'Rna_source': ('rna source',),
}

# Flat lookup: TAG text -> metadata column (every tag belongs to one column).
_TAG_TO_COLUMN = {
    tag: column
    for column, tags in _COLUMN_TAGS.items()
    for tag in tags
}

# Keyword patterns used to derive the 'Condition' column.
# NOTE(review): both patterns are kept exactly as they were. In the tumor
# pattern "([C|c]ancer)([D|d]isease)" has no "|" between the two groups, so
# it only matches the text "cancerdisease"; confirm whether "cancer" and
# "disease" were meant to be separate alternatives before changing it.
_NORMAL_PATTERN = re.compile(
    r"([N|n]on-?\_?[T|t]umor)|([N|n]on-?\_?[C|c]ancer)|(normal)|([H|h]ealthy)|([C|c]ontr?ol)"
    r"|(\_ant)|(adjacent)|([\_|-]N\d)|(\dN;?)|([N|n]or)|([C|c]trl)|((B|(MC))-?\d+[\_]RNA-Seq)",
    re.IGNORECASE)
_TUMOR_PATTERN = re.compile(
    r"([T|t]o?umor)|([H|h]cc)|([H|h]epatocellular)|([C|c]ancer)([D|d]isease)|(carcinoma)|"
    r"(\_T\d)|(\dT;?)|(icc)|(hgdn)|(lgnd)|(p?dc)|(metastas)",
    re.IGNORECASE)


def _apply_sample_attributes(record, attributes_node):
    """Copy the recognised SAMPLE_ATTRIBUTE values of one sample into *record*."""
    # find_all only returns elements, so text nodes between the attributes
    # (e.g. whitespace of pretty-printed xml) are skipped instead of raising,
    # and an empty SAMPLE_ATTRIBUTES element simply yields nothing.
    attributes = attributes_node.find_all('SAMPLE_ATTRIBUTE', recursive=False)
    for position, attribute in enumerate(attributes):
        if attribute.TAG is None:
            continue
        tag = attribute.TAG.text
        value = attribute.VALUE.text if attribute.VALUE is not None else ''

        if position == 0 and tag == 'source_name':
            # A leading source_name fills its own column; a source_name at
            # any later position is stored as body site (see _COLUMN_TAGS).
            record['Source_name'] = value
            continue

        column = _TAG_TO_COLUMN.get(tag)
        if column is None:
            continue
        # Age is stored with a leading apostrophe (presumably so spreadsheet
        # tools keep values such as ranges as text -- TODO confirm).
        record[column] = "'" + value if column == 'Age' else value


def _collect_run_record(soup):
    """Return a dict (metadata column -> text) for one parsed xml document."""
    record = dict.fromkeys(_METADATA_COLUMNS, '')

    experiment = soup.EXPERIMENT
    record['Experiment_title'] = experiment.TITLE.text
    record['Study_ref'] = experiment.STUDY_REF.IDENTIFIERS.PRIMARY_ID.text

    library = soup.DESIGN.LIBRARY_DESCRIPTOR
    if library.LIBRARY_NAME is not None:
        record['Library_name'] = library.LIBRARY_NAME.text
    record['Library_strategy'] = library.LIBRARY_STRATEGY.text
    record['Library_selection'] = library.LIBRARY_SELECTION.text

    # The external id of the sample overrides the one of the study.
    if soup.STUDY.IDENTIFIERS.EXTERNAL_ID is not None:
        record['BioSample'] = soup.STUDY.IDENTIFIERS.EXTERNAL_ID.text
    sample = soup.SAMPLE
    if sample.IDENTIFIERS.EXTERNAL_ID is not None:
        record['BioSample'] = sample.IDENTIFIERS.EXTERNAL_ID.text

    if sample.TITLE is not None:
        record['Sample_title'] = sample.TITLE.text

    sample_name = sample.SAMPLE_NAME
    if sample_name.TAXON_ID is not None:
        record['Taxon_id'] = sample_name.TAXON_ID.text
    # The common name overrides the scientific name when both are present.
    if sample_name.SCIENTIFIC_NAME is not None:
        record['Taxon_name'] = sample_name.SCIENTIFIC_NAME.text
    if sample_name.COMMON_NAME is not None:
        record['Taxon_name'] = sample_name.COMMON_NAME.text

    if sample.SAMPLE_ATTRIBUTES is not None:
        _apply_sample_attributes(record, sample.SAMPLE_ATTRIBUTES)
    return record


def _classify_condition(values):
    """Return 'normal', 'tumor' or '' depending on keywords found in *values*."""
    # Normal keywords win: tumor keywords are only consulted when no value
    # matches the normal pattern.
    if any(_NORMAL_PATTERN.search(value) for value in values):
        return 'normal'
    if any(_TUMOR_PATTERN.search(value) for value in values):
        return 'tumor'
    return ''


def parse_soup(xml_files, outpath):
    """
    Function to parse xml metadata files from NCBI into two csv tables

    Writes ``outpath + '.csv'`` (one row per run with all metadata columns
    and the derived 'Condition') and ``outpath + '_short.csv'`` (FileID,
    CaseID, SampleType, Project).

    :param xml_files: iterable with the paths of the xml files to parse; the
        file name up to the first dot is used as run id and only the first
        file per run id is kept
    :param outpath: output path without extension for the metadata sheets
    """
    metadata = {}

    for file in xml_files:
        srr = os.path.basename(file).split(".")[0]
        with open(file, 'r') as f:
            soup = BeautifulSoup(f, "lxml-xml")
        record = _collect_run_record(soup)

        # save each Run accession and associated metadata
        if srr not in metadata:
            metadata[srr] = [record[column] for column in _METADATA_COLUMNS]

    # derive the condition of every run from keywords in its metadata
    for row in metadata.values():
        row.append(_classify_condition(row))

    # create dataframe from rich metadata
    df = pd.DataFrame.from_dict(metadata, orient='index',
                                columns=_METADATA_COLUMNS + ['Condition'])

    # save dataframe on disk, with the run id as first column
    df.reset_index(level=0, inplace=True)
    df.rename({'index': 'RunID'}, axis=1, inplace=True)
    df.to_csv(outpath + '.csv', header=True, index=False)

    # save metadata table additionally as short table
    metadata_df = df[['RunID', 'BioSample', 'Condition', 'Study_ref']].copy()
    metadata_df.rename(
        columns={'RunID': 'FileID', 'BioSample': 'CaseID',
                 'Condition': 'SampleType', 'Study_ref': 'Project'},
        inplace=True)
    metadata_df.to_csv(outpath + '_short.csv', header=True, index=False)
# Run the command line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()