Source code for Darts_DNN.Darts_tx2g

# -*- coding: UTF-8 -*-

"""
Darts_DNN - tx2g

Aggregate kallisto transcript-level TPM values into gene-level TPM.

TODO: 
	parse a GTF file for mapping IDs in the future
"""

import sys
import os
from collections import defaultdict
from . import config

def read_t2g(t2g_fn=None):
    """Build a transcript-to-gene lookup from a gene/transcript table.

    The table is whitespace-delimited.  Its first non-blank line is a header
    that names at least the ``gene_id`` and ``transcript_id(s)`` columns; the
    latter holds a comma-separated list of transcript IDs.  Everything from
    the first ``.`` onwards is stripped from gene and transcript IDs, and
    rows whose gene ID does not start with ``ENSG`` are ignored.

    Args:
        t2g_fn: Path of the table to read.  When omitted (``None``) the
            path is taken from ``config.T2G_FILE_PATH``.

    Returns:
        A tuple ``(rsem_dict, t2g)``.  ``rsem_dict`` is always an empty
        dict (no TPM values are read here; it is kept so existing callers
        can still unpack two values).  ``t2g`` maps each transcript ID to
        its gene ID.

    Raises:
        KeyError: If a data row is reached and the header does not name
            ``gene_id`` or ``transcript_id(s)``.
    """
    if t2g_fn is None:
        t2g_fn = config.T2G_FILE_PATH

    rsem_dict = {}
    t2g = {}
    header = None
    with open(t2g_fn, 'r') as fin:
        for line in fin:
            ele = line.split()
            if not ele:
                # A blank line holds no record; indexing it would fail.
                continue
            if header is None:
                # Column name -> column index, taken from the header row.
                header = {name: idx for idx, name in enumerate(ele)}
                continue
            gid = ele[header['gene_id']].split('.')[0]
            if not gid.startswith('ENSG'):
                continue
            for tid in ele[header['transcript_id(s)']].split(','):
                t2g[tid.split('.')[0]] = gid
    return rsem_dict, t2g
def read_kallisto(fn, t2g):
    """Sum kallisto transcript TPM values into per-gene totals.

    Args:
        fn: Path of a whitespace-delimited abundance table.  Its first
            non-blank line is a header that names at least the
            ``target_id`` and ``tpm`` columns.
        t2g: Mapping from transcript ID to gene ID.  Transcript IDs are
            looked up after stripping everything from the first ``.``.

    Returns:
        A ``defaultdict(float)`` mapping each gene ID to the sum of the
        ``tpm`` values of its transcripts found in ``fn``.

    Transcripts that have no entry in ``t2g`` are left out of the totals
    instead of aborting the whole run with a ``KeyError``; the number of
    such transcripts is reported on stderr.

    Raises:
        KeyError: If a data row is reached and the header does not name
            ``target_id`` or ``tpm``.
        ValueError: If a ``tpm`` field cannot be parsed as a float.
    """
    kal_dict = defaultdict(float)
    header = None
    n_unmapped = 0
    with open(fn, 'r') as fin:
        for line in fin:
            ele = line.split()
            if not ele:
                # A blank line holds no record; indexing it would fail.
                continue
            if header is None:
                # Column name -> column index, taken from the header row.
                header = {name: idx for idx, name in enumerate(ele)}
                continue
            tid = ele[header['target_id']].split('.')[0]
            tpm = float(ele[header['tpm']])
            try:
                gid = t2g[tid]
            except KeyError:
                # The mapping may not cover every quantified transcript.
                n_unmapped += 1
                continue
            kal_dict[gid] += tpm
    if n_unmapped:
        sys.stderr.write(
            'read_kallisto: skipped %d transcript(s) in %s with no gene mapping\n'
            % (n_unmapped, fn)
        )
    return kal_dict
def write_out_comparison(rsem_dict, kal_dict, fn):
    """Write a tab-separated RSEM-vs-kallisto table to ``fn``.

    The file starts with the header ``gene_id``, ``rsem``, ``kallisto`` and
    then holds one row per key of ``kal_dict``, in its iteration order.
    Each row carries the gene ID followed by ``rsem_dict[gene]`` and
    ``kal_dict[gene]``, both formatted with ``%f``.

    Every key of ``kal_dict`` must also be present in ``rsem_dict``;
    otherwise the lookup raises ``KeyError``.
    """
    with open(fn, 'w') as fout:
        fout.write('gene_id\trsem\tkallisto\n')
        fout.writelines(
            '%s\t%f\t%f\n' % (gene, rsem_dict[gene], kal_dict[gene])
            for gene in kal_dict
        )
def write_out_quant(kal_dict, fn):
    """Write gene-level TPM values to ``fn`` as a tab-separated table.

    The file starts with the header ``gene_id``, ``TPM`` and then holds one
    row per key of ``kal_dict``, in its iteration order, with the value
    formatted with ``%f``.
    """
    with open(fn, 'w') as fout:
        fout.write('gene_id\tTPM\n')
        fout.writelines(
            '%s\t%f\n' % (gene, kal_dict[gene]) for gene in kal_dict
        )
def kallisto2rsem(kal_dir, t2g):
    """Convert one kallisto output directory to a gene-level TPM table.

    Reads ``abundance.tsv`` inside ``kal_dir`` with ``read_kallisto`` (using
    the transcript-to-gene mapping ``t2g``) and writes the result to
    ``gene_tpm.tsv`` in the same directory with ``write_out_quant``.
    """
    abundance_path = os.path.join(kal_dir, 'abundance.tsv')
    gene_tpm_path = os.path.join(kal_dir, 'gene_tpm.tsv')
    write_out_quant(read_kallisto(abundance_path, t2g), gene_tpm_path)
def parser(kal_dir):
    """Run the transcript-to-gene conversion for ``kal_dir``.

    Loads the transcript-to-gene mapping with ``read_t2g`` and hands it,
    together with ``kal_dir``, to ``kallisto2rsem``.
    """
    # read_t2g also returns an RSEM dict, which is not needed here.
    _rsem_unused, t2g = read_t2g()
    kallisto2rsem(kal_dir, t2g)