Hi all,
I'm trying to replicate a Nature Protocols paper: genome-wide copy number analysis of single cells from Baslan et al., 2012.
In step 69 they generate synthetic 50 bp reads for each position on the hg19 genome: start at position one in the chromosome sequence and take the first 50 bases. Also create read ID strings and quality score strings. These can be Illumina or fastq format. Output these to a file and continue likewise at positions 2, 3, 4 and so on until the end of the chromosome is reached.
The script to do this can be found in the supplementary data, and I've pasted it below:
#!/usr/bin/env python
import re
import sys
import random
import string
def main():
CHROMLIST = open("/filepath/chromlist.txt", "r")
chromlist = []
for x in CHROMLIST:
if x.rstrip().find("_") == -1:
chromlist.append(x.rstrip())
INFILE = False
OUTFILE = open("/filepath/sequence.part.0.k50.txt", "w")
outfilecount = 0
counter = 0
for chrFile in chromlist:
a = chrFile.split("/")
b = a[len(a) - 1]
thisChrom = b.split(".")[0]
if INFILE:
INFILE.close()
INFILE = open(chrFile, "r")
chr = []
x = ""
y = ""
INFILE.readline()
for x in INFILE:
chr.append(x.rstrip())
x = "".join(chr)
y = x.upper()
print "after read " + thisChrom
sys.stdout.flush()
for i in range(len(y) - 50):
counter += 1
thisRead = y[i: (i+50)]
if counter % 150000000 == 0:
OUTFILE.close()
outfilecount += 1
OUTFILE = open("/filepath/sequence.part." + str(outfilecount) + ".k50.txt", "w")
OUTFILE.write("@" + thisChrom + "." + str(i))
OUTFILE.write("\n")
OUTFILE.write(thisRead)
OUTFILE.write("\n")
OUTFILE.write("+" + thisChrom + "." + str(i))
OUTFILE.write("\n")
OUTFILE.write("b" * len(thisRead))
OUTFILE.write("\n")
INFILE.close()
OUTFILE.close()
if __name__ == "__main__":
main()
I don't have a lot of experience with python, and when I run this script (after adapting the filepath), I only get 1 empty file. Is there something else I need to adapt?
Many thanks in advance,
Lien
I'm trying to replicate a Nature Protocols paper: genome-wide copy number analysis of single cells from Baslan et al., 2012.
In step 69 they generate synthetic 50 bp reads for each position on the hg19 genome: start at position one in the chromosome sequence and take the first 50 bases. Also create read ID strings and quality score strings. These can be Illumina or fastq format. Output these to a file and continue likewise at positions 2, 3, 4 and so on until the end of the chromosome is reached.
The script to do this can be found in the supplementary data, and I've pasted it below:
#!/usr/bin/env python
import re
import sys
import random
import string
def main():
CHROMLIST = open("/filepath/chromlist.txt", "r")
chromlist = []
for x in CHROMLIST:
if x.rstrip().find("_") == -1:
chromlist.append(x.rstrip())
INFILE = False
OUTFILE = open("/filepath/sequence.part.0.k50.txt", "w")
outfilecount = 0
counter = 0
for chrFile in chromlist:
a = chrFile.split("/")
b = a[len(a) - 1]
thisChrom = b.split(".")[0]
if INFILE:
INFILE.close()
INFILE = open(chrFile, "r")
chr = []
x = ""
y = ""
INFILE.readline()
for x in INFILE:
chr.append(x.rstrip())
x = "".join(chr)
y = x.upper()
print "after read " + thisChrom
sys.stdout.flush()
for i in range(len(y) - 50):
counter += 1
thisRead = y[i: (i+50)]
if counter % 150000000 == 0:
OUTFILE.close()
outfilecount += 1
OUTFILE = open("/filepath/sequence.part." + str(outfilecount) + ".k50.txt", "w")
OUTFILE.write("@" + thisChrom + "." + str(i))
OUTFILE.write("\n")
OUTFILE.write(thisRead)
OUTFILE.write("\n")
OUTFILE.write("+" + thisChrom + "." + str(i))
OUTFILE.write("\n")
OUTFILE.write("b" * len(thisRead))
OUTFILE.write("\n")
INFILE.close()
OUTFILE.close()
if __name__ == "__main__":
main()
I don't have a lot of experience with python, and when I run this script (after adapting the filepath), I only get 1 empty file. Is there something else I need to adapt?
Many thanks in advance,
Lien
Comment