#!/usr/bin/python

import os, sys, re
from optparse import OptionParser
import pdb

import PopulateRefSeq
sys.path.insert(0, '../.')
from MyException import MyException


### EXPLANATION ###
# Populates a database of genes from Cufflinks data.
# These genes are screened to guarantee the following:
#	  a valid status in the Cufflinks file (status == 'OK')
# 	a status of "Reviewed" or "Validated" or "Provisional" in refseqStatus SQL table.
#
# This populates each of the following SQL tables:
#	H1genes, HOSgenes, JurkatGenes, ENCODE1genes, ENCODE2genes
# It is possible for the same gene to appear in any or all of these tables.
# Each gene name (as in human-readable name) is only allowed once in each table.
#
# There are several problems with redundant cross-referencing in the cufflinks output.
# This is due to redundancy in the refGene SQL table. There are 3 possible scenarios:
#
# 1) Multiple refseq IDs point to the same gene name (RUFY3)
#   In this case, genes.fpkm_tracking has one record, but the coordinates given for the transcript
#   do NOT match any record in refGene. Instead, cufflinks takes the smallest start value and the
#   largest end value for all transcripts.
#   TO SOLVE:
#     a) as per normal records, select refGene.name and refGene.strand based on chrom, txStart, txEnd and name2
#     b) when no result is returned, select name, txStart, txEnd based on name2
#     c) find the largest transcript (NOTE:this choice is arbitrary)
#     d) use this record's refseq ID (refGene.name)
#     e) use the fpkm values from the genes.fpkm_tracking record (even though this is not
#        the value for this specific transcript)
#
# 2) One refseq ID has multiple records in refGene (FAM21B)
#   In this case, genes.fpkm_tracking has as many records as there are duplicates in refGene.
#   TO SOLVE:
#     a) use the genes.fpkm_tracking record with the highest FPKM value
#     b) to break a tie, use the first listed record (or whatever is programatically easiest)
#
# 3) Combination of problems (1) and (2); Multiple refseq IDs point to the same gene name AND
#    some of these refseq IDs are duplicated in refGene (DAZ4)
#   TO SOLVE:
#     a) take record with highest FPKM, as in (2) above
#     b) query for refseq ID (refGene.name) based on name2, txStart, txEnd
#     c) if more than one record is returned
#     d) store TSSID from genes.fpkm_tracking record (index of 5 with zero-based indexing)
#     e) find the record in isoforms.fpkm_tracking that has this TSSID, genename, chrom, txStart and txStop
#     f) if there is more than one record that corresponds to this, chose the highest FPKM
#     g) use the tracking_id (refseq ID) in this record
#     h) this tracking_id may need to have a '_n' suffix removed (where n is a one or more digits)
#
# 4) Special case of (1) where all isoforms have the same txStart in refGene
#   In this case, there is only one refGene record, since the largest transcript

class CufflinksGenes:
	chromList = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
							 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17',
							 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM')

##	acceptableSampleNameValues = ['h1', 'hos', 'jurkat', 'encode1', 'encode2']
##	genesFilePaths = ['/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/H1/genes.fpkm_tracking',
##										'/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/HOS/genes.fpkm_tracking',
##										'/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/Jurkat/genes.fpkm_tracking',
##										'/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep1/genes.fpkm_tracking',
##										'/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep2/genes.fpkm_tracking']
##	isoformsFilePaths = ['/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/H1/isoforms.fpkm_tracking',
##											 '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/HOS/isoforms.fpkm_tracking',
##											 '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/Jurkat/isoforms.fpkm_tracking',
##											 '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep1/isoforms.fpkm_tracking',
##											 '/media/Thoth/shared/Sutton-Lab/data/RS_pipeline/cufflinks/ENCODE/rep2/isoforms.fpkm_tracking']
##	fileIndices = {'h1':0, 'hos':1, 'jurkat':2, 'encode1':3, 'encode2':4}
	
##	def __init__(self, sampleName):
	def __init__(self, genesDir, tableName):
		if not os.path.exists(genesDir):
			s = 'ERROR - file does not exist: {0}'.format(genesDir)
			raise MyException(s, 'CufflinksGenes.init()', True)
		self.genesDir = genesDir

		self.tableName = tableName.lower()
		print 'WARNING - this program assumes the SQL table {0} is EMPTY'.format(self.tableName)
		answer = raw_input('is this true? y/[n]: ')
		if not ('y' == answer.lower() or 'yes' == answer.lower()):
			sys.exit(1)

		self.refseq = PopulateRefSeq.PopulateRefSeq()
		self.geneDetail = {}
		self.isoforms = {}
		self.logFile = open('./logs/CufflinksGenes.log', 'w')
		return

	def __del__(self):
		self.logFile.close()

	### read_isoforms_file ###
	#     0           1           2               3       4             5       6     7       8         9     10            11            12
	# tracking_id	class_code	nearest_ref_id	gene_id	gene_short_name	tss_id	locus	length	coverage	FPKM	FPKM_conf_lo	FPKM_conf_hi	FPKM_status
	def build_isoforms_hash(self):
##		filePath = self.isoformsFilePaths[self.fileIndices[self.sampleName]]
		filePath = os.path.join(self.genesDir, 'isoforms.fpkm_tracking')
		fh = open(filePath, 'r')
		fh.readline() # throw out descriptive text header
		line = fh.readline()
		while line:
			columns = line.rstrip().split('\t')
			m = re.search('(N._\d*)', columns[0])
			refseqID = m.group(1)
			geneName = columns[3]
			tssID = columns[5]
			(chrom, start, stop) = self.parse_locus(columns[6])
			fpkm = columns[9]
			isoformInfo = (refseqID, geneName, chrom, start, stop, fpkm) # use a tuple; data shouldn't change
			if tssID in self.isoforms.keys():
				self.isoforms[tssID].append(isoformInfo)
			else:
				self.isoforms[tssID] = [isoformInfo]
			line = fh.readline()
		fh.close()
		return

	### parse_gene_file ###
	# This method reads the file line by line and processes each line.
	# Any strange "data referencing" scenarios, as discussed in the header comments, are dealt with here.
	def parse_gene_file(self):
##		filePath = self.genesFilePaths[self.fileIndices[self.sampleName]]
		filePath = os.path.join(self.genesDir, 'genes.fpkm_tracking')
		fh = open(filePath, 'r')
		fh.readline() # throw out descriptive header line
		line = fh.readline()
		while line:
			parsedDetails = self.parse_gene_details(line)
			if parsedDetails:
				(geneName, chrom, start, stop, fpkm, fpkmLo, fpkmHi, tssID) = parsedDetails

				refseqResult = self.fetch_refseq_ID(geneName, chrom, start, stop)

				if not refseqResult:
					# Scenario (1)
					self.logFile.write('(1) {0}'.format(line))
					longestIsoform = self.find_longest_transcript(geneName)
					if not longestIsoform:
						line = fh.readline()
						continue # there is no record of this gene name in refGene table
					(geneID, strand, start, stop) = longestIsoform
					geneInfo = (geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi)
					self.add_record_to_gene_hash(geneName, geneInfo)

				elif len(refseqResult) > 1:
					# Scenario (3)
					self.logFile.write('(3) {0}'.format(line))
					geneID = self.find_matching_isoform(tssID, geneName, chrom, start, stop)
					if not geneID:
						raise MyException('ERROR - no result returned from find_matching_isoform()\n' + line, 'CufflinksGenes.parse_gene_file()', True)
					strand = None
					for row in refseqResult:
						(refseqID, currentStrand) = row
						if refseqID == geneID:
							strand = currentStrand
					if None == strand:
						s = 'ERROR - isoforms refseq ID {0} does not match refseq IDs associated with name {1}:\n\t'.format(geneID, geneName)
						for row in refseqResult:
							s += '{0} '.format(row[0])
						s += '\n'
						self.logFile.write(s)
						line = fh.readline()
						continue
					geneInfo = (geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi)
					self.add_record_to_gene_hash(geneName, geneInfo)
					
				else:
					# normal cases and Scenario (2)
					row = refseqResult[0]
					(refseqID, strand) = row
					geneInfo = (refseqID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi)
					self.add_record_to_gene_hash(geneName, geneInfo)
					
			line = fh.readline()
		fh.close()
		return

	### add_record_to_gene_hash ###
	# This method adds a record to self.geneDetail if it is unique to that hash key
	# or if the FPKM value is higher than the currently stored record.
	# In this way, we implicitly take care of Scenario (2) and the second half of Scenario (3)
	# The only problem with filtering by FPKM here (near the end of the pipeline) is that every record
	# in Scenario (3) is treated as being valid. Therefore, more work is done than is necessary.
	# However, I feel the program logic flows much better, this way.
	def add_record_to_gene_hash(self, geneName, geneInfo):
		if geneName in self.geneDetail.keys():
			self.logFile.write('dup {0} {1}\t{2}\n'.format(geneName, geneInfo, self.geneDetail[geneName]))
			if geneInfo[4] > self.geneDetail[geneName][4]: # compare FPKM values
				self.geneDetail[geneName] = geneInfo
		else:
			self.geneDetail[geneName] = geneInfo
		return		

	### parse_gene_details ###
	# This method examines a single line of the genes file.
	# Record format is as follows:
	#     0           1           2               3       4             5       6     7       8        9        10            11            12
	# tracking_id	class_code	nearest_ref_id	gene_id	gene_short_name	tss_id	locus	length	coverage	FPKM	FPKM_conf_lo	FPKM_conf_hi	FPKM_status
	#
	# Each value is separated out and some values are used for quality checking on the record
	# If the record is worth keeping, the following values are returned, in this order:
	#   gene name (tracking_id)
	#   chromosome (part of locus)
	#   transcription start (part of locus)
	#   transcription stop (part of locus)
	#   fpkm (FPKM)
	#   95% confidence interval range, low, then high (FPKM_conf_lo, FPKM_conf_hi)
	#   FPKM_status is NOT RETURNED, it is only used to determine if the record should be processed any further
	# If the record is not worth keeping, 'None' is returned
	def parse_gene_details(self, line):
		# break the line into individual values
		columns = line.rstrip().split('\t')

		# Parse record into individual values we can use
		geneName = columns[0]
		(chrom, start, stop) = self.parse_locus(columns[6])
		fpkm = float(columns[9])
		fpkmLo = float(columns[10])
		fpkmHi = float(columns[11])
		tssID = columns[5]
		recordStatus = columns[12]

		# don't process records with untrustworthy data
		if 'OK' != recordStatus:
			return None

		# don't process records that reside on strange chromosomes
		if chrom not in self.chromList:
			return None

		# place a minimum floor on all FPKM values
		if self.fpkm_below_minimum(fpkm): fpkm = 0
		if self.fpkm_below_minimum(fpkmLo): fpkmLo = 0
		if self.fpkm_below_minimum(fpkmHi): fpkmHi = 0

		return (geneName, chrom, start, stop, fpkm, fpkmLo, fpkmHi, tssID)
	
	### parse_locus ###
	def parse_locus(self, locus):
		m = re.search('(chr.*):(\d*)-(\d*)', locus)
		chrom = m.group(1)
		start = m.group(2)
		stop = m.group(3)
		return (chrom, start, stop)

	### fpkm_below_minimum ###
	# Change very low fpkm values to zero, for easier storage
	# This is just a cludge to prevent PostGreSQL from attempting to enter values that are too for it to handle
	def fpkm_below_minimum(self, value):
		if value < 0.000001: # 1E-6
			return True
		else:
			return False
	

	### fetch_refseq_ID ###
	# This is the first method we call to find the refseq ID to associate with a particular gene.
	def fetch_refseq_ID(self, geneName, chrom, start, stop):
##		for geneName in self.geneDetail.keys():
##			(geneID, chrom, start, stop, fpkm, fpkmLo, fpkmHi) = self.geneDetail[geneName]
		qStr = """
      SELECT name, strand
      FROM   refGene
      WHERE  name2 = '{0}'
      AND    chrom = '{1}'
      AND    txStart = {2}
      AND    txEnd = {3}
      ;""".format(geneName, chrom, start, stop)
		result = self.refseq.general_query(qStr)
		return result

	### find_longest_transcript ###
	def find_longest_transcript(self, geneName):
		maxName = None
		maxStrand = None
		maxStart = maxEnd = 0
		qStr = """
      SELECT name, strand, txStart, txEnd
      FROM   refGene
      WHERE  name2 = '{0}'
      ;""".format(geneName)
		result = self.refseq.general_query(qStr)
		
		if not result:
			self.logFile.write('unknown {0}\n'.format(geneName))
			return None # no record has refGene.name2 set to the passed in value of geneName

		for row in result:
			(name, strand, txStart, txEnd) = row
			if abs(txEnd - txStart) > abs(maxEnd - maxStart):
				maxName = name
				maxStrand = strand
				maxStart = txStart
				maxEnd = txEnd
		return (maxName, maxStrand, maxStart, maxEnd)

	### find_matching_isoform ###
	# For a given set of tssIDs, return the refseq ID with the highest fpkm value
	def find_matching_isoform(self, tssID, genesName, genesChrom, genesStart, genesStop):
		maxFPKM = 0
		highestFPKMgeneID = None
		for tss in tssID.split(','): # this field can contain several tssIDs, separated by ','
			for isoformInfo in self.isoforms[tss]:
				(refseqID, isoformName, isoformChrom, isoformStart, isoformStop, isoformFPKM) = isoformInfo
				if isoformChrom == genesChrom and isoformStart == genesStart and isoformStop == genesStop and isoformName == genesName:
					if isoformFPKM > maxFPKM:
						maxFPKM = isoformFPKM
						highestFPKMgeneID = refseqID
		return highestFPKMgeneID


	### populate_table ###
	def populate_table(self):
##		tableName = self.sampleName + 'gene' # postgresql table names are case-insensitive
		for geneName in self.geneDetail.keys():
			(geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi) = self.geneDetail[geneName]
			geneInfo = (geneName, geneID, chrom, strand, start, stop, fpkm, fpkmLo, fpkmHi)
			self.refseq.insert_into_gene_table(self.tableName, geneInfo )


### read_parameters ###
def read_parameters():
	parser = OptionParser()
##	parser.add_option('-n', '--sampleName', dest='sampleName', \
##		help='sample name of the RNA-seq experiment (ie: H1, HOS, Jurkat, ENCODE1, ENCODE2)')
	parser.add_option('-g', '--genesDir', dest='genesDir', help='directory containing cufflinks results')
	parser.add_option('-t', '--tableName', dest='tableName', help='SQL table to populate')
	(parms, args) = parser.parse_args()
    
	if None == parms.genesDir or None == parms.tableName:
		print 'WARNING - both \"genes file\" and \"table name\" must be specified'
		parser.print_help()
		return None
	return parms


### main ###
if __name__ == '__main__':
	opts = read_parameters()
	if opts:
		try:
			print 'instantiating class'
			x = CufflinksGenes(opts.genesDir, opts.tableName)
			
			print 'indexing isoforms'
			x.build_isoforms_hash()
			
			print 'indexing genes'
			x.parse_gene_file()
			
			print 'populating table'
			x.populate_table()
		except MyException as exc:
			print exc.get_method()
			print exc.get_msg()