# This is a script to parse multiple files from featureCounts to perform batch correction
import click
import logging
import time
import sys
import pandas as pd
from pathlib import Path
import fileUtils.file_handling as fh
# Create logger
logger = logging.getLogger('FeatureCounts table creator')
# Create console handler
ch = logging.StreamHandler()
# Create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
logger.setLevel(logging.INFO)
@click.command()
@click.option('-i', '--inpath', prompt='path to count Files',
help='Path to folder with featureCount values',
required=True)
@click.option('-g', '--genes', prompt='gene ids from rnaseq',
help='Path to extracted genes from rnaseq file', )
@click.option('-o', '--outpath', prompt='output file',
help='Output file; Table containing featureCount values',
required=True)
@click.option('--lengths/--no-lengths', prompt='extract lengths of genes',
help='writes table with gene lengths if true, default=False',
default=False)
def main(inpath, genes, outpath, lengths):
print("Starting...")
start_time = time.time()
# printed to STDOUT on command line
logger.info('Get featureCount input files')
allfiles, sample_ids = fh.get_files_and_sample_ids(inpath, '*.txt')
logger.info('Parse featureCount files')
gene_counts, gene_dict, gene_lengths = parse_featureCounts(allfiles, genes)
logger.info('Write count values to table')
table = fh.write_table(gene_dict, sample_ids, gene_counts, outpath)
if lengths:
write_lengths(gene_lengths, outpath)
end_time = time.time()
logger.info('Process finished in ' + str(round(end_time - start_time, 2)) + "sec")
[docs]def parse_featureCounts(files, genes):
"""Read a file and the sample ID and save gene_ids and tmp_values
:param files: list of files
:param genes: dict mapping gene_ids to gene_names
:return: counts_all_files list of lists
:return: gene_dict of unique genes
:return: geneLengths_all_files dict mapping ids to lengths
"""
gene_dict = fh.read_unique_genes(genes)
# initialize dict with keys from gene_dict and empty list as value --> where the tmp values are to be stored
counts_all_files = {key: [] for key in gene_dict.keys()}
geneLengths_all_files = {key: [] for key in gene_dict.keys()}
for file in files:
file = file.split('/')[-1]
# Read each file
with open(file, 'r') as f:
count_dict_one_file = {}
# read file from 2nd line
for line in f.readlines()[2:]:
# split lines by tab
fields = line.strip().split("\t")
# take "Gene Id" and "Gene Name"
current_gene_id = fields[0]
current_length = fields[5]
# read count at last column
current_count = fields[-1]
# Append gene_id, tmp_value to temporary dict for 1 file
if current_gene_id not in count_dict_one_file.keys():
count_dict_one_file[current_gene_id] = int(current_count)
# save exon gene lengths: they are identical for all featureCounts outputs
geneLengths_all_files[current_gene_id] = current_length
else:
count_dict_one_file[current_gene_id] += int(current_count)
# Append values of 1 file to tmp_values list comprising all files
for gene_id, tpm_value in count_dict_one_file.items():
counts_all_files[gene_id].append(tpm_value)
return counts_all_files, gene_dict, geneLengths_all_files
[docs]def write_lengths(geneLengths_all_files, outpath):
"""Create mini-table with gene-lengths from RNAseq pipeline from FeatureCounts output
:param geneLengths_all_files: gene_ids mapped to gene_lengths
:param outpath: path where to store outfile, txt
"""
# Output comprises all non-overlapping bases in exons belonging to the same gene
table = pd.DataFrame.from_dict(geneLengths_all_files, orient='index', columns=['GeneLength'])
table.reset_index(level=0, inplace=True)
table.rename({'index': 'GeneID'}, axis=1, inplace=True)
p = Path(outpath)
ext = p.suffix
new_outpath = p.rename(Path(p.parent, "featureCounts_geneLengths" + ext))
# print(new_outpath)
table.to_csv(new_outpath, index=False)
if __name__ == "__main__":
main()