#!/usr/bin/env python # fix a bug in cufflinks that prevents it to read BAM files in # which reference sequences (@SQ lines) have more than one colon # See http://seqanswers.com/forums/showthread.php?t=9304 import sys i_fn, o_fn = sys.argv[1:3] i = open(i_fn, 'rU') o = open(o_fn, 'w') while True: line = i.readline() if (line == ''): break line = line.strip() # header section, reference sequence dictionary if (line.startswith("@SQ")): items = line.split('\t') sn_idx = None for p, item in enumerate(items): if (item.startswith("SN:")): sn_idx = p break if (sn_idx == None): print >>sys.stderr, "ERROR: No reference sequence name found (line was '%s')" % line sys.exit(1) sn = items[sn_idx] if (sn.count(':') > 1): sn = "SN:" + sn[3:].replace(':', '_') line = '\t'.join(items[:sn_idx] + [sn] + items[sn_idx+1:]) # alignment section elif (not line.startswith('@')): items = line.split('\t') if (len(items) < 11): print >>sys.stderr, "ERROR: Invalid alignment (line was '%s')" % line sys.exit(1) rname = items[2] if (rname.count(':') > 0): line = '\t'.join(items[:2] + [rname.replace(':', '_')] + items[3:]) print >>o, line