# Purpose of this script is to read all genes that can be found by the nf-core/rnaseq pipeline
# into a list with Ensembl IDs and gene names
# genes.gtf available at https://ewels.github.io/AWS-iGenomes/
# imports
import re
[docs]def lines(genes):
"""Generator for lines in gtf file
:param genes: .gtf file
"""
with open(genes, 'r') as g:
for line in g:
yield parse(line)
[docs]def parse(line):
"""Parse ;-sep line of gtf file
:param line: of gtf file
:return: gene_dict
"""
gene_dict = {}
# split by tab and only save last column with attribute list
info_list = line.strip().split("\t")[-1]
# info format: gene_name "DDX11L1"; gene_id "ENS....";
attributes = [x for x in re.split(";", info_list) if x.strip()]
for i, attribute in enumerate(attributes):
# split by whitespace --> key = gene_name : value = "DDX11L1"
key, value = attribute.split()
# strip trailing characters
key = key.strip()
# strip of "
value = value.strip('"')
gene_dict[key] = value
return gene_dict
[docs]def create_dict(path):
"""Save unique gene ids and names to dict
:param path: to gtf file
:return: gene_dict mapping gene_ids to gene_names
"""
id_name_dict = {}
# iterate through generator object
gene_dict = lines(path)
for line in gene_dict:
# value of 'gene_id' becomes key in new dict
key = line.get('gene_id')
value = line.get('gene_name')
if key not in id_name_dict:
# dict[key] = value
# 'ENSG00000132781' : 'MUTYH'
id_name_dict[key] = value
return id_name_dict
[docs]def write_genes(path, outfile):
"""Write genes2names into table
:param path: to genes.gtf
:param outfile: path to output file .txt
"""
gene_id_name_dict = create_dict(path)
with open(outfile, 'w') as file:
for key, value in gene_id_name_dict.items():
file.write("{0}{1}{2}{3}".format(key, "\t", value, "\n"))
if __name__ == "__main__":
infile = "genes.gtf"
write_genes(infile, 'genes_id2name.txt')