-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsepamlyn.py
136 lines (128 loc) · 7.24 KB
/
parsepamlyn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
# Copyright (C) 2015,2016 Mohammad Alanjary
# University of Tuebingen
# Interfaculty Institute of Microbiology and Infection Medicine
# Lab of Nadine Ziemert, Div. of Microbiology/Biotechnology
# Funding by the German Centre for Infection Research (DZIF)
#
# This file is part of ARTS
# ARTS is free software. you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version
#
# License: You should have received a copy of the GNU General Public License v3 with ARTS
# A copy of the GPLv3 can also be found at: <http://www.gnu.org/licenses/>.
import argparse, glob, tempfile, numpy as np
def runall(finput, fout=None, lt=1.0, gt=1.0, method="NG86"):
Allrecs={}
if type(finput)==list:
flist=finput
else:
flist=glob.glob(finput)
for fname in flist:
recs={}
recs['NG86']=[] #(A) Nei-Gojobori (1986) method
recs['YN2K']=[] #(B) Yang & Nielsen (2000) method
# recs['LWL85']=[] #(C) LWL85, LPB93 & LWLm methods
recs['LWL85M']=[] #(C) LWL85, LPB93 & LWLm methods
recs['LPB93']=[] #(C) LWL85, LPB93 & LWLm methods
#yang_kappa=None
mthd=""
seqlabels=[]
L=[0,0]
with open(fname,"r") as fil:
for line in fil:
try:
if line.startswith("(A)"):
mthd="A"
elif line.startswith("(B)"):
mthd="B"
elif line.startswith("(C)"):
mthd="C"
#Nei-Gojobori
if mthd=="A" and " " in line:
x = line.replace("("," ").replace(")"," ").split()
seqlabels.append(x[0])
if len(x)>1:
w = x[1::3]
dN = x[2::3]
dS = x[3::3]
for i in range(len(w)): #for i in xrange(len(w)):
# Store record: [seq#1, seq#2, omega, dN, dS]
recs["NG86"].append([len(seqlabels),i,float(w[i]),float(dN[i]),float(dS[i])])
#Yang and Nielsen
if mthd=="B" and "+-" in line and not "seq." in line:
x=line.split()
#Store record: [seq#1, seq#2, omega, dN, dS, dN std error, dS std error,]
omega=float(x[6])
if float(x[10])==99: #Flag extreme dS error
omega=-1.0
recs["YN2K"].append([int(x[0]),int(x[1]),float(x[6]),float(x[7]),omega,float(x[9]),float(x[-1])])
#if not yang_kappa:
#yang_kappa=float(x[5])
#LWL & LPB
if mthd=="C":
if "vs." in line:
L=line.split("vs.")
L[0]=int(L[0][0:L[0].index("(")])
L[1]=int(L[1][0:L[1].index("(")])
#if line.startswith("LWL85") or line.startswith("LWL85M") or line.startswith("LPB93"):
if line.startswith("LWL85m") or line.startswith("LPB93"):
x = line.split()
recs[x[0][0:x[0].index(":")].upper()].append(L+[float(v) for v in x[3:10:3][::-1]])
except Exception as e:
print("error reading values %s"%e)
#Merge and summarize
def isval(v):
return (v>=0 and v!=(v+1))
klist = recs.keys()
for k in klist:
try:
omegaAV = np.median([x[2] for x in recs[k] if isval(x[2])])
omegaSD = np.std([x[2] for x in recs[k] if isval(x[2])])
dNAV = np.median([x[3] for x in recs[k] if isval(x[3])])
dNSD = np.median([x[3] for x in recs[k] if isval(x[3])])
dSAV = np.median([x[4] for x in recs[k] if isval(x[4])])
dSSD = np.median([x[4] for x in recs[k] if isval(x[4])])
recs[k+"_AVG"]=[omegaAV,dNAV,dSAV]
recs[k+"_STD"]=[omegaSD,dNSD,dSSD]
except Exception as e:
print("Error calculating median %s"%e)
Allrecs[fname[0:fname.index(".")]]=recs
#Get gene list under threshold and calculate method agreement:
LTlist = []
GTlist = []
for k,recs in Allrecs.items():
x = recs[method+"_AVG"][0]
if np.median(x) <= lt:
LTlist.append(k)
elif np.median(x) > gt:
GTlist.append(k)
#Output result
if fout:
with open("median_"+fout,"w") as afil:
with open("std_"+fout,"w") as sfil:
#head="#GENE\t#Nei-Gojobori_omega_dN_dS\t#Yang-Nielsen_omega_dN_dS\t#Li-Wu-Luo85_omega_dN_dS\t#Li-Wu-Luo93_omega_dN_dS\t#Pamilo-Bianchi_omega_dN_dS\t#Method_median_RSD"
head="#GENE\t#Nei-Gojobori_omega_dN_dS\t#Yang-Nielsen_omega_dN_dS\t#Li-Wu-Luo93_omega_dN_dS\t#Pamilo-Bianchi_omega_dN_dS\t#Method_median_RSD"
afil.write(head+"\n")
sfil.write(head+"\n")
for k,recs in Allrecs.items():
mOmega = [recs[m][0] for m in recs if m.endswith("_AVG")]
mOAR = [np.median(mOmega),100*np.std(mOmega)/np.median(mOmega)] # median and RSD omega of all methods
afil.write("%s\t%.4f %.4f %.4f\t%.4f %.4f %.4f\t%.4f %.4f %.4f\t%.4f %.4f %.4f\t%.4f %.2f%%\n"%tuple([k]+list(recs.get("NG86_AVG",[-1,-1,-1]))+list(recs.get("YN2K_AVG",[-1,-1,-1]))+list(recs.get("LWL85M_AVG",[-1,-1,-1]))+list(recs.get("LPB93_AVG",[-1,-1,-1]))+mOAR))
sfil.write("%s\t%.4f %.4f %.4f\t%.4f %.4f %.4f\t%.4f %.4f %.4f\t%.4f %.4f %.4f\n"%tuple([k]+list(recs.get("NG86_STD",[-1,-1,-1]))+list(recs.get("YN2K_STD",[-1,-1,-1]))+list(recs.get("LWL85M_STD",[-1,-1,-1]))+list(recs.get("LPB93_STD",[-1,-1,-1]))))
afil.write("##Filtered list using "+method+"\n")
afil.write("#dN/dS>"+str(gt)+":\t"+",".join(GTlist)+"\n")
afil.write("#dN/dS<="+str(lt)+":\t"+",".join(LTlist)+"\n")
else:
return LTlist,GTlist,Allrecs
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="PAML yn00 output and return dictionary or summary table")
parser.add_argument("input",nargs="?",help="List expression of files for conversion ex: somedirectory/*.txt (default: currentdir/*.kaks)",default="*.kaks")
parser.add_argument("-o","--out",help="Store results in OUT (default: summarykaks.txt)",default="summarykaks.txt")
parser.add_argument("-m","--method",help="Use this method to threshold list ['NG86'=Nei-Gojobori(1986),'YN2K'=Yang-Nielsen(2000),'LWL85M'=Li-Wu(1993),'LPB93'=Pamilo-Bianchi(1993)] (default='NG86')",default="NG86")
parser.add_argument("-lt","--lthrsh",help="Threshold value to print all genes under: avg(omega)+std(omega) <= thrsh (default=1.0)",default=1.0)
parser.add_argument("-gt","--gthrsh",help="Threshold value to print all genes over: avg(omega)+std(omega) > thrsh (default=1.0)",default=1.0)
args = parser.parse_args()
runall(args.input,args.out,args.lthrsh,args.gthrsh,args.method)