我有以下代码:
from Bio import AlignIO
import itertools
# Intermediate CSV; opened in append mode, so rows written by earlier runs are kept.
out=open("test.csv","a")
# Read the alignment from the FASTA-formatted input file.
align = AlignIO.read("HPV16_CG.aln.fas", "fasta")
# Index of the alignment column currently being examined (starts at the first column).
n=0
def SNP(line):
    """Return the CSV fields for the base currently being compared.

    Relies on module-level state set by the calling loop: ``n`` (column
    index), ``y`` (row index into ``align``) and ``x`` (the base in that row).

    The result is ``[n + 1 as a string, line[0], fourth '|'-separated field
    of the row's record id, x]``.
    """
    return [
        str(n + 1),
        line[0],
        align[y].id.rsplit("|")[3],
        x,
    ]
# Walk the alignment column by column and append one CSV row to ``out`` for
# every base that differs from the first sequence's base in that column.
# ``n``, ``y`` and ``x`` stay module-level names because SNP() reads them.
while n < len(align[0]):
    line = align[:, n]
    reference_base = line[0]
    for y, x in enumerate(line):
        if x != reference_base:
            out.write(','.join(map(str, SNP(line))) + '\n')
    y = 0
    n += 1
out.close()
# Second pass: regroup the intermediate rows so that consecutive rows sharing
# the same first field (the position) end up on one line of test_2.csv.
# The context managers guarantee both files are closed even if grouping or
# writing raises part-way through.
with open("test.csv", "rU") as f, open("test_2.csv", "a") as out:
    # split() breaks on any whitespace; rows are assumed to contain none.
    lines = f.read().split()
    for key, group in itertools.groupby(lines, lambda line: line.partition(',')[0]):
        out.write(','.join(group) + '\n')
如您所见,我目前会写入两个文件,但实际上我只需要第二个文件。有没有人能建议如何把这两段“子脚本”合并成一个?
输入文件“HPV16_CG.aln.fas”如下所示:
>gi|333031|lcl|HPV16REF.1| Alpha-9 - Human Papillomavirus 16, complete genome.
ACTACAATAATTCATGTATAAAACTAAGGGCGTAACCGAAATCGGTTGAACCGAAACCGG
>gi|333031|gb|K02718.1|PPH16 Human papillomavirus type 16 (HPV16), complete genome
ACTACAATAATTCATGTATAAAACTAAGGGCGTAACCGAAATCGGTTGAACCGAAACCGG
>gi|196170262|gb|FJ006723.1| Human papillomavirus type 16, complete genome
ACTACAATAATTCATGTATAAAACTAAGGGCGTAACCGAAATCGGTTGAACCGAAACCGG
非常感谢任何能帮助我改进代码的意见和建议!
最佳答案
最简单的做法是将文件的行保留在内存中,但我怀疑这不会起作用,因为任何有用的生物信息学文件可能都非常大。
下面尝试清理这个脚本:去掉对全局变量的依赖,并添加一个生成器函数,用来逐行产出由 SNP 函数返回的行;这种流式处理应该与您的 itertools.groupby 调用兼容。
from Bio import AlignIO
import itertools
# Index of the alignment column at which the scan starts (passed to generate_snp_lines).
n=0
# Read the alignment from the FASTA-formatted input file.
align = AlignIO.read("HPV16_CG.aln.fas", "fasta")
def SNP(line, y, x, column=None, alignment=None):
    """Build the CSV fields for one base that differs from the first sequence.

    Args:
        line: One alignment column; ``line[0]`` is the base of the first
            sequence.
        y: Row index of the differing sequence within the alignment.
        x: The base found in row ``y``.
        column: 0-based index of the alignment column. Defaults to the
            module-level ``n`` so existing three-argument calls keep working.
        alignment: Alignment used to look up the record id. Defaults to the
            module-level ``align``.

    Returns:
        ``[position, line[0], id_field, x]`` where ``position`` is the
        1-based column number as a string and ``id_field`` is the fourth
        ``|``-separated field of the record id.
    """
    # Fall back to the module-level state only when the caller did not say
    # which column/alignment it is working on.
    if column is None:
        column = n
    if alignment is None:
        alignment = align
    return [
        str(column + 1),
        line[0],
        alignment[y].id.rsplit("|")[3],
        x,
    ]


def generate_snp_lines(align, n):
    """Yield one CSV line per base that differs from the first sequence.

    Args:
        align: Alignment supporting ``align[0]``, ``align[row]`` and
            ``align[:, column]``.
        n: 0-based column index at which to start scanning.

    Yields:
        Comma-joined strings of the fields produced by ``SNP`` in column
        order, then row order, without writing anything to a file.
    """
    for column in range(n, len(align[0])):
        line = align[:, column]
        reference_base = line[0]
        for y, x in enumerate(line):
            if x != reference_base:
                # Pass the current column and alignment explicitly: the
                # module-level ``n`` never advances, so relying on it would
                # label every row as position 1.
                yield ','.join(map(str, SNP(line, y, x, column, align)))
def main():
    """Append the generated SNP lines to test.csv, one line per position.

    Consecutive lines that share the same first comma-separated field are
    joined with commas into a single output line.
    """
    def position_of(line):
        # The grouping key is everything before the first comma.
        return line.partition(',')[0]

    # The context manager opens the file and closes it for us when done.
    with open("test.csv", "a") as out:
        # Lazily produced lines; groupby consumes them like any iterable.
        snp_lines = generate_snp_lines(align, n)
        for _position, rows in itertools.groupby(snp_lines, position_of):
            out.write(','.join(rows) + '\n')


if __name__ == "__main__":
    main()
关于python - 使用 "Biopython"- 我怎样才能改进我的代码,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/11817495/