#!/usr/bin/python import os, sys, re from optparse import OptionParser import pdb import PopulateRefSeq sys.path.insert(0, '../.') from MyException import MyException ### EXPLANATION ### # Populates a database of genes from Cufflinks data. # These genes are screened to guarantee the following: # a valid status in the Cufflinks file (status == 'OK') # a status of "Reviewed" or "Validated" or "Provisional" in refseqStatus SQL table. # # This populates each of the following SQL tables: # H1genes, HOSgenes, JurkatGenes, ENCODE1genes, ENCODE2genes # It is possible for the same gene to appear in any or all of these tables. # Each gene name (as in human-readable name) is only allowed once in each table. # # There are several problems with redundant cross-referencing in the cufflinks output. # This is due to redundancy in the refGene SQL table. There are 3 possible scenarios: # # 1) Multiple refseq IDs point to the same gene name (RUFY3) # In this case, genes.fpkm_tracking has one record, but the coordinates given for the transcript # do NOT match any record in refGene. Instead, cufflinks takes the smallest start value and the # largest end value for all transcripts. # TO SOLVE: # a) as per normal records, select refGene.name and refGene.strand based on chrom, txStart, txEnd and name2 # b) when no result is returned, select name, txStart, txEnd based on name2 # c) find the largest transcript (NOTE:this choice is arbitrary) # d) use this record's refseq ID (refGene.name) # e) use the fpkm values from the genes.fpkm_tracking record (even though this is not # the value for this specific transcript) # # 2) One refseq ID has multiple records in refGene (FAM21B) # In this case, genes.fpkm_tracking has as many records as there are duplicates in refGene. # TO SOLVE: # a) use the genes.fpkm_tracking record with the highest FPKM value # b) to break a tie, use the first listed record (or whatever is programatically easiest) # # 3) Combination of problems (1) and (2); Multiple refseq IDs point to the same gene name AND # some of these refseq IDs are duplicated in refGene (DAZ4) # TO SOLVE: # a) take record with highest FPKM, as in (2) above # b) query for refseq ID (refGene.name) based on name2, txStart, txEnd # c) if more than one record is returned # d) store TSSID from genes.fpkm_tracking record (index of 5 with zero-based indexing) # e) find the record in isoforms.fpkm_tracking that has this TSSID, genename, chrom, txStart and txStop # f) if there is more than one record that corresponds to this, chose the highest FPKM # g) use the tracking_id (refseq ID) in this record # h) this tracking_id may need to have a '_n' suffix removed (where n is a one or more digits) # # 4) Special case of (1) where all isoforms have the same txStart in refGene # In this case, there is only one refGene record, since the largest transcript class CufflinksGenes: chromList = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM') ## acceptableSampleNameValues = ['h1', 'hos', 'jurkat', 'encode1', 'encode2'] ## genesFilePaths = ['/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/H1/genes.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/HOS/genes.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/Jurkat/genes.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep1/genes.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep2/genes.fpkm_tracking'] ## isoformsFilePaths = ['/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/H1/isoforms.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/HOS/isoforms.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/Jurkat/isoforms.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep1/isoforms.fpkm_tracking', ## '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep2/isoforms.fpkm_tracking'] ## fileIndices = {'h1':0, 'hos':1, 'jurkat':2, 'encode1':3, 'encode2':4} ## def __init__(self, sampleName): def __init__(self, genesDir, tableName): if not os.path.exists(genesDir): s = 'ERROR - file does not exist: {0}'.format(genesDir) raise MyException(s, 'CufflinksGenes.init()', True) self.genesDir = genesDir self.tableName = tableName.lower() print 'WARNING - this program assumes the SQL table {0} is EMPTY'.format(self.tableName) answer = raw_input('is this true? y/[n]: ') if not ('y' == answer.lower() or 'yes' == answer.lower()): sys.exit(1) self.refseq = PopulateRefSeq.PopulateRefSeq() self.geneDetail = {} self.isoforms = {} self.logFile = open('./logs/CufflinksGenes.log', 'w') return def __del__(self): self.logFile.close() ### read_isoforms_file ### # 0 1 2 3 4 5 6 7 8 9 10 11 12 # tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status def build_isoforms_hash(self): ## filePath = self.isoformsFilePaths[self.fileIndices[self.sampleName]] filePath = os.path.join(self.genesDir, 'isoforms.fpkm_tracking') fh = open(filePath, 'r') fh.readline() # throw out descriptive text header line = fh.readline() while line: columns = line.rstrip().split('\t') m = re.search('(N._\d*)', columns[0]) refseqID = m.group(1) geneName = columns[3] tssID = columns[5] (chrom, start, stop) = self.parse_locus(columns[6]) fpkm = columns[9] isoformInfo = (refseqID, geneName, chrom, start, stop, fpkm) # use a tuple; data shouldn't change if tssID in self.isoforms.keys(): self.isoforms[tssID].append(isoformInfo) else: self.isoforms[tssID] = [isoformInfo] line = fh.readline() fh.close() return ### parse_gene_file ### # This method reads the file line by line and processes each line. # Any strange "data referencing" scenarios, as discussed in the header comments, are dealt with here. def parse_gene_file(self): ## filePath = self.genesFilePaths[self.fileIndices[self.sampleName]] filePath = os.path.join(self.genesDir, 'genes.fpkm_tracking') fh = open(filePath, 'r') fh.readline() # throw out descriptive header line line = fh.readline() while line: parsedDetails = self.parse_gene_details(line) if parsedDetails: (geneName, chrom, start, stop, fpkm, fpkmLo, fpkmHi, tssID) = parsedDetails refseqResult = self.fetch_refseq_ID(geneName, chrom, start, stop) if not refseqResult: # Scenario (1) self.logFile.write('(1) {0}'.format(line)) longestIsoform = self.find_longest_transcript(geneName) if not longestIsoform: line = fh.readline() continue # there is no record of this gene name in refGene table (geneID, strand, start, stop) = longestIsoform geneInfo = (geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi) self.add_record_to_gene_hash(geneName, geneInfo) elif len(refseqResult) > 1: # Scenario (3) self.logFile.write('(3) {0}'.format(line)) geneID = self.find_matching_isoform(tssID, geneName, chrom, start, stop) if not geneID: raise MyException('ERROR - no result returned from find_matching_isoform()\n' + line, 'CufflinksGenes.parse_gene_file()', True) strand = None for row in refseqResult: (refseqID, currentStrand) = row if refseqID == geneID: strand = currentStrand if None == strand: s = 'ERROR - isoforms refseq ID {0} does not match refseq IDs associated with name {1}:\n\t'.format(geneID, geneName) for row in refseqResult: s += '{0} '.format(row[0]) s += '\n' self.logFile.write(s) line = fh.readline() continue geneInfo = (geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi) self.add_record_to_gene_hash(geneName, geneInfo) else: # normal cases and Scenario (2) row = refseqResult[0] (refseqID, strand) = row geneInfo = (refseqID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi) self.add_record_to_gene_hash(geneName, geneInfo) line = fh.readline() fh.close() return ### add_record_to_gene_hash ### # This method adds a record to self.geneDetail if it is unique to that hash key # or if the FPKM value is higher than the currently stored record. # In this way, we implicitly take care of Scenario (2) and the second half of Scenario (3) # The only problem with filtering by FPKM here (near the end of the pipeline) is that every record # in Scenario (3) is treated as being valid. Therefore, more work is done than is necessary. # However, I feel the program logic flows much better, this way. def add_record_to_gene_hash(self, geneName, geneInfo): if geneName in self.geneDetail.keys(): self.logFile.write('dup {0} {1}\t{2}\n'.format(geneName, geneInfo, self.geneDetail[geneName])) if geneInfo[4] > self.geneDetail[geneName][4]: # compare FPKM values self.geneDetail[geneName] = geneInfo else: self.geneDetail[geneName] = geneInfo return ### parse_gene_details ### # This method examines a single line of the genes file. # Record format is as follows: # 0 1 2 3 4 5 6 7 8 9 10 11 12 # tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi FPKM_status # # Each value is separated out and some values are used for quality checking on the record # If the record is worth keeping, the following values are returned, in this order: # gene name (tracking_id) # chromosome (part of locus) # transcription start (part of locus) # transcription stop (part of locus) # fpkm (FPKM) # 95% confidence interval range, low, then high (FPKM_conf_lo, FPKM_conf_hi) # FPKM_status is NOT RETURNED, it is only used to determine if the record should be processed any further # If the record is not worth keeping, 'None' is returned def parse_gene_details(self, line): # break the line into individual values columns = line.rstrip().split('\t') # Parse record into individual values we can use geneName = columns[0] (chrom, start, stop) = self.parse_locus(columns[6]) fpkm = float(columns[9]) fpkmLo = float(columns[10]) fpkmHi = float(columns[11]) tssID = columns[5] recordStatus = columns[12] # don't process records with untrustworthy data if 'OK' != recordStatus: return None # don't process records that reside on strange chromosomes if chrom not in self.chromList: return None # place a minimum floor on all FPKM values if self.fpkm_below_minimum(fpkm): fpkm = 0 if self.fpkm_below_minimum(fpkmLo): fpkmLo = 0 if self.fpkm_below_minimum(fpkmHi): fpkmHi = 0 return (geneName, chrom, start, stop, fpkm, fpkmLo, fpkmHi, tssID) ### parse_locus ### def parse_locus(self, locus): m = re.search('(chr.*):(\d*)-(\d*)', locus) chrom = m.group(1) start = m.group(2) stop = m.group(3) return (chrom, start, stop) ### fpkm_below_minimum ### # Change very low fpkm values to zero, for easier storage # This is just a cludge to prevent PostGreSQL from attempting to enter values that are too for it to handle def fpkm_below_minimum(self, value): if value < 0.000001: # 1E-6 return True else: return False ### fetch_refseq_ID ### # This is the first method we call to find the refseq ID to associate with a particular gene. def fetch_refseq_ID(self, geneName, chrom, start, stop): ## for geneName in self.geneDetail.keys(): ## (geneID, chrom, start, stop, fpkm, fpkmLo, fpkmHi) = self.geneDetail[geneName] qStr = """ SELECT name, strand FROM refGene WHERE name2 = '{0}' AND chrom = '{1}' AND txStart = {2} AND txEnd = {3} ;""".format(geneName, chrom, start, stop) result = self.refseq.general_query(qStr) return result ### find_longest_transcript ### def find_longest_transcript(self, geneName): maxName = None maxStrand = None maxStart = maxEnd = 0 qStr = """ SELECT name, strand, txStart, txEnd FROM refGene WHERE name2 = '{0}' ;""".format(geneName) result = self.refseq.general_query(qStr) if not result: self.logFile.write('unknown {0}\n'.format(geneName)) return None # no record has refGene.name2 set to the passed in value of geneName for row in result: (name, strand, txStart, txEnd) = row if abs(txEnd - txStart) > abs(maxEnd - maxStart): maxName = name maxStrand = strand maxStart = txStart maxEnd = txEnd return (maxName, maxStrand, maxStart, maxEnd) ### find_matching_isoform ### # For a given set of tssIDs, return the refseq ID with the highest fpkm value def find_matching_isoform(self, tssID, genesName, genesChrom, genesStart, genesStop): maxFPKM = 0 highestFPKMgeneID = None for tss in tssID.split(','): # this field can contain several tssIDs, separated by ',' for isoformInfo in self.isoforms[tss]: (refseqID, isoformName, isoformChrom, isoformStart, isoformStop, isoformFPKM) = isoformInfo if isoformChrom == genesChrom and isoformStart == genesStart and isoformStop == genesStop and isoformName == genesName: if isoformFPKM > maxFPKM: maxFPKM = isoformFPKM highestFPKMgeneID = refseqID return highestFPKMgeneID ### populate_table ### def populate_table(self): ## tableName = self.sampleName + 'gene' # postgresql table names are case-insensitive for geneName in self.geneDetail.keys(): (geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi) = self.geneDetail[geneName] geneInfo = (geneName, geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi) self.refseq.insert_into_gene_table(self.tableName, geneInfo ) ### read_parameters ### def read_parameters(): parser = OptionParser() ## parser.add_option('-n', '--sampleName', dest='sampleName', \ ## help='sample name of the RNA-seq experiment (ie: H1, HOS, Jurkat, ENCODE1, ENCODE2)') parser.add_option('-g', '--genesDir', dest='genesDir', help='directory containing cufflinks results') parser.add_option('-t', '--tableName', dest='tableName', help='SQL table to populate') (parms, args) = parser.parse_args() if None == parms.genesDir or None == parms.tableName: print 'WARNING - both \"genes file\" and \"table name\" must be specified' parser.print_help() return None return parms ### main ### if __name__ == '__main__': opts = read_parameters() if opts: try: print 'instantiating class' x = CufflinksGenes(opts.genesDir, opts.tableName) print 'indexing isoforms' x.build_isoforms_hash() print 'indexing genes' x.parse_gene_file() print 'populating table' x.populate_table() except MyException as exc: print exc.get_method() print exc.get_msg()