Source code for parseFeatureCounts

# This is a script to parse multiple files from featureCounts to perform batch correction

import click
import logging
import time
import sys
import pandas as pd
from pathlib import Path
import fileUtils.file_handling as fh

# Create logger
logger = logging.getLogger('FeatureCounts table creator')
# Create console handler
ch = logging.StreamHandler()
# Create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
logger.setLevel(logging.INFO)


@click.command()
@click.option('-i', '--inpath', prompt='path to count Files',
              help='Path to folder with featureCount values',
              required=True)
@click.option('-g', '--genes', prompt='gene ids from rnaseq',
              help='Path to extracted genes from rnaseq file', )
@click.option('-o', '--outpath', prompt='output file',
              help='Output file; Table containing featureCount values',
              required=True)
@click.option('--lengths/--no-lengths', prompt='extract lengths of genes',
              help='writes table with gene lengths if true, default=False',
              default=False)
def main(inpath, genes, outpath, lengths):
    print("Starting...")
    start_time = time.time()

    # printed to STDOUT on command line
    logger.info('Get featureCount input files')
    allfiles, sample_ids = fh.get_files_and_sample_ids(inpath, '*.txt')

    logger.info('Parse featureCount files')
    gene_counts, gene_dict, gene_lengths = parse_featureCounts(allfiles, genes)

    logger.info('Write count values to table')
    table = fh.write_table(gene_dict, sample_ids, gene_counts, outpath)

    if lengths:
        write_lengths(gene_lengths, outpath)

    end_time = time.time()
    logger.info('Process finished in ' + str(round(end_time - start_time, 2)) + "sec")


[docs]def parse_featureCounts(files, genes): """Read a file and the sample ID and save gene_ids and tmp_values :param files: list of files :param genes: dict mapping gene_ids to gene_names :return: counts_all_files list of lists :return: gene_dict of unique genes :return: geneLengths_all_files dict mapping ids to lengths """ gene_dict = fh.read_unique_genes(genes) # initialize dict with keys from gene_dict and empty list as value --> where the tmp values are to be stored counts_all_files = {key: [] for key in gene_dict.keys()} geneLengths_all_files = {key: [] for key in gene_dict.keys()} for file in files: file = file.split('/')[-1] # Read each file with open(file, 'r') as f: count_dict_one_file = {} # read file from 2nd line for line in f.readlines()[2:]: # split lines by tab fields = line.strip().split("\t") # take "Gene Id" and "Gene Name" current_gene_id = fields[0] current_length = fields[5] # read count at last column current_count = fields[-1] # Append gene_id, tmp_value to temporary dict for 1 file if current_gene_id not in count_dict_one_file.keys(): count_dict_one_file[current_gene_id] = int(current_count) # save exon gene lengths: they are identical for all featureCounts outputs geneLengths_all_files[current_gene_id] = current_length else: count_dict_one_file[current_gene_id] += int(current_count) # Append values of 1 file to tmp_values list comprising all files for gene_id, tpm_value in count_dict_one_file.items(): counts_all_files[gene_id].append(tpm_value) return counts_all_files, gene_dict, geneLengths_all_files
[docs]def write_lengths(geneLengths_all_files, outpath): """Create mini-table with gene-lengths from RNAseq pipeline from FeatureCounts output :param geneLengths_all_files: gene_ids mapped to gene_lengths :param outpath: path where to store outfile, txt """ # Output comprises all non-overlapping bases in exons belonging to the same gene table = pd.DataFrame.from_dict(geneLengths_all_files, orient='index', columns=['GeneLength']) table.reset_index(level=0, inplace=True) table.rename({'index': 'GeneID'}, axis=1, inplace=True) p = Path(outpath) ext = p.suffix new_outpath = p.rename(Path(p.parent, "featureCounts_geneLengths" + ext)) # print(new_outpath) table.to_csv(new_outpath, index=False)
if __name__ == "__main__": main()