Hi everyone,
I am a novice at biopython, but have gotten a few things to work so far. Previously, I used biopython to pull nucleotide and protein sequences from a number of gene that were differentially expressed in my RNA-seq analysis. I am now trying to perform GO analysis on my dataset, and am trying to use biopython to gather the Entrez gene IDs (needed for the gene-2-go annotations in the GO analysis R package) from the nucleotide genbank nucleotide IDs.
My script seems to be working fine, but the problem comes after about 10-60s of running. At that point, it appears to stop querying the database and becomes "stuck". I've attempted to put in a "try-except" loop for when it gets stuck, but this doesn't seem to work. I'll post my code below along with the error message after I control-c to exit the program.
NOTE: my output file is correct up to the point where biopython stops querying the database. Every run gets "stuck" at a different point, so I don't think there is anything wrong with my files.
what the file looks like that needs to be parsed:
>ABCA2|NM_001606.4
ATGGGC...TGA
>ABHD15|NM_198147.2
ATGCCG...TAG
etc...
the output file will be identical, but with the additional Entrez IDs after the genbank IDs, e.g.:
>ABCA2|NM_001606.4|20
etc...
my code:
and the error:
any help would be great!
I am a novice at biopython, but have gotten a few things to work so far. Previously, I used biopython to pull nucleotide and protein sequences from a number of gene that were differentially expressed in my RNA-seq analysis. I am now trying to perform GO analysis on my dataset, and am trying to use biopython to gather the Entrez gene IDs (needed for the gene-2-go annotations in the GO analysis R package) from the nucleotide genbank nucleotide IDs.
My script seems to be working fine, but the problem comes after about 10-60s of running. At that point, it appears to stop querying the database and becomes "stuck". I've attempted to put in a "try-except" loop for when it gets stuck, but this doesn't seem to work. I'll post my code below along with the error message after I control-c to exit the program.
NOTE: my output file is correct up to the point where biopython stops querying the database. Every run gets "stuck" at a different point, so I don't think there is anything wrong with my files.
what the file looks like that needs to be parsed:
>ABCA2|NM_001606.4
ATGGGC...TGA
>ABHD15|NM_198147.2
ATGCCG...TAG
etc...
the output file will be identical, but with the additional Entrez IDs after the genbank IDs, e.g.:
>ABCA2|NM_001606.4|20
etc...
my code:
Code:
from Bio import Entrez import glob import re Entrez.email = "[email protected]" filenames = glob.glob("*_cds.fas") for file in filenames: print "working on %s"%file ofile = open(file) wfile = open(file+"_entrez",'w') n=0 for line in ofile: if line.startswith(">"): line = [x.strip() for x in line.split("|")] handle = Entrez.esearch(db="gene",term=line[1].strip()) EntrezID = Entrez.read(handle) EntrezID = EntrezID["IdList"][0]+"\n" wfile.write('|'.join(x for x in line+[EntrezID])) n+=1 if n%100 == 0: print "processed %s sequences"%n else: wfile.write(line) print "finished, processed %s entries"%n ofile.close() wfile.close()
Code:
KeyboardInterrupt Traceback (most recent call last) /Users/XXX/Desktop/XXX/XXX/XXX/XXX/Add_Entrez_IDs.py in <module>() 21 if line.startswith(">"): 22 line = [x.strip() for x in line.split("|")] ---> 23 handle = Entrez.esearch(db="gene",term=line[1].strip()) 24 EntrezID = Entrez.read(handle) 25 EntrezID = EntrezID["IdList"][0]+"\n" /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Bio/Entrez/__init__.pyc in esearch(db, term, **keywds) 187 'term': term} 188 variables.update(keywds) --> 189 return _open(cgi, variables) 190 191 /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Bio/Entrez/__init__.pyc in _open(cgi, params, post) 464 # HTTP GET 465 cgi += "?" + options --> 466 handle = _urlopen(cgi) 467 except _HTTPError as exception: 468 raise exception /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context) 152 else: 153 opener = _opener --> 154 return opener.open(url, data, timeout) 155 156 def install_opener(opener): /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout) 429 req = meth(req) 430 --> 431 response = self._open(req, data) 432 433 # post-process response /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in _open(self, req, data) 447 protocol = req.get_type() 448 result = self._call_chain(self.handle_open, protocol, protocol + --> 449 '_open', req) 450 if result: 451 return result /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args) 407 func = getattr(handler, meth_name) 408 --> 409 result = func(*args) 410 if result is not None: 411 return result /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in http_open(self, req) 1225 1226 def http_open(self, req): -> 1227 return self.do_open(httplib.HTTPConnection, req) 1228 1229 http_request = AbstractHTTPHandler.do_request_ /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args) 1192 1193 try: -> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers) 1195 except socket.error, err: # XXX what error? 1196 h.close() /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in request(self, method, url, body, headers) 1051 def request(self, method, url, body=None, headers={}): 1052 """Send a complete request to the server.""" -> 1053 self._send_request(method, url, body, headers) 1054 1055 def _set_content_length(self, body, method): /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers) 1091 for hdr, value in headers.iteritems(): 1092 self.putheader(hdr, value) -> 1093 self.endheaders(body) 1094 1095 def getresponse(self, buffering=False): /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in endheaders(self, message_body) 1047 else: 1048 raise CannotSendHeader() -> 1049 self._send_output(message_body) 1050 1051 def request(self, method, url, body=None, headers={}): /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in _send_output(self, message_body) 891 msg += message_body 892 message_body = None --> 893 self.send(msg) 894 if message_body is not None: 895 #message_body was not a string (i.e. it is a file) and /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in send(self, data) 853 if self.sock is None: 854 if self.auto_open: --> 855 self.connect() 856 else: 857 raise NotConnected() /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in connect(self) 830 """Connect to the host and port specified in __init__.""" 831 self.sock = self._create_connection((self.host,self.port), --> 832 self.timeout, self.source_address) 833 834 if self._tunnel_host: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.pyc in create_connection(address, timeout, source_address) 564 if source_address: 565 sock.bind(source_address) --> 566 sock.connect(sa) 567 return sock 568 /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.pyc in meth(name, self, *args) 226 227 def meth(name,self,*args): --> 228 return getattr(self._sock,name)(*args) 229 230 for _m in _socketmethods: KeyboardInterrupt:
Comment