Bioinformatics Algorithms

Median String

Back to Index

Finds the median string of length k in a group of sequences.

Algorithm is very slow and only works for relatively small values for k and relatively short sequences.
#!/usr/bin/python import sys #lines = sys.stdin.read().splitlines() #k=int(lines[0]) #seqs=[] #for x in lines[1:]: # seqs.append(x) seqs = ["CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC", "GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC", "GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG"] k=7 def HammingDistance(p,q): score=0 for n in range(0,len(p)): if p[n] != q[n]: score = score + 1 return score def dispattstring(p, s2): klen = len(p) dis = 0 for s in s2: hamd = 100 for i in range(0,len(s)-klen+1): if hamd > HammingDistance(p,s[i:i+klen]): hamd = HammingDistance(p,s[i:i+klen]) dis = dis + hamd return dis def N2P(i,k): if k==1: return N2S(i) pref = Q(i,4) r = REM(i,4) symbol = N2S(r) pP = N2P(pref,k-1) return pP + symbol def Q(i,d): return i//d def REM(i,d): return i%d def N2S(r): if r==0: return "A" elif r==1: return "C" elif r==2: return "G" else: return "T" def MedianString(seqs, k): distance = 888 median="" for i in range(0,pow(4,k)): patt=N2P(i,k) score = dispattstring(patt,seqs) if distance>score: distance = score median=patt return median output = MedianString(seqs,k) print output

SOURCE CODE:
MEDSTR.py

Main Index