Bioinformatics Algorithms
Median String
Back to Index
Finds the median string of length k in a group of sequences.
Algorithm is very slow and only works for relatively small values for k and
relatively short sequences.
#!/usr/bin/python
import sys
#lines = sys.stdin.read().splitlines()
#k=int(lines[0])
#seqs=[]
#for x in lines[1:]:
# seqs.append(x)
seqs = ["CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC",
"GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC",
"GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG"]
k=7
def HammingDistance(p,q):
score=0
for n in range(0,len(p)):
if p[n] != q[n]:
score = score + 1
return score
def dispattstring(p, s2):
klen = len(p)
dis = 0
for s in s2:
hamd = 100
for i in range(0,len(s)-klen+1):
if hamd > HammingDistance(p,s[i:i+klen]):
hamd = HammingDistance(p,s[i:i+klen])
dis = dis + hamd
return dis
def N2P(i,k):
if k==1:
return N2S(i)
pref = Q(i,4)
r = REM(i,4)
symbol = N2S(r)
pP = N2P(pref,k-1)
return pP + symbol
def Q(i,d):
return i//d
def REM(i,d):
return i%d
def N2S(r):
if r==0:
return "A"
elif r==1:
return "C"
elif r==2:
return "G"
else:
return "T"
def MedianString(seqs, k):
distance = 888
median=""
for i in range(0,pow(4,k)):
patt=N2P(i,k)
score = dispattstring(patt,seqs)
if distance>score:
distance = score
median=patt
return median
output = MedianString(seqs,k)
print output
Main Index