Source code for extract_gene_ids2name

# Purpose of this script is to read all genes that can be found by the nf-core/rnaseq pipeline
# into a list with Ensembl IDs and gene names
# genes.gtf available at https://ewels.github.io/AWS-iGenomes/

# imports
import re


[docs]def lines(genes): """Generator for lines in gtf file :param genes: .gtf file """ with open(genes, 'r') as g: for line in g: yield parse(line)
[docs]def parse(line): """Parse ;-sep line of gtf file :param line: of gtf file :return: gene_dict """ gene_dict = {} # split by tab and only save last column with attribute list info_list = line.strip().split("\t")[-1] # info format: gene_name "DDX11L1"; gene_id "ENS...."; attributes = [x for x in re.split(";", info_list) if x.strip()] for i, attribute in enumerate(attributes): # split by whitespace --> key = gene_name : value = "DDX11L1" key, value = attribute.split() # strip trailing characters key = key.strip() # strip of " value = value.strip('"') gene_dict[key] = value return gene_dict
[docs]def create_dict(path): """Save unique gene ids and names to dict :param path: to gtf file :return: gene_dict mapping gene_ids to gene_names """ id_name_dict = {} # iterate through generator object gene_dict = lines(path) for line in gene_dict: # value of 'gene_id' becomes key in new dict key = line.get('gene_id') value = line.get('gene_name') if key not in id_name_dict: # dict[key] = value # 'ENSG00000132781' : 'MUTYH' id_name_dict[key] = value return id_name_dict
[docs]def write_genes(path, outfile): """Write genes2names into table :param path: to genes.gtf :param outfile: path to output file .txt """ gene_id_name_dict = create_dict(path) with open(outfile, 'w') as file: for key, value in gene_id_name_dict.items(): file.write("{0}{1}{2}{3}".format(key, "\t", value, "\n"))
if __name__ == "__main__": infile = "genes.gtf" write_genes(infile, 'genes_id2name.txt')