Bioinformatics Algorithms
Improved K-mer Count
Back to Index
This script performs the same task as the Counting K-mers script
on page one, but it employees the N2P and P2N functions and uses
a list instead of a dictionary to implement a more efficient algorithm.
TO RUN THIS SCRIPT:
./KCOUNT.py < pdic_data.txt
#!/usr/bin/python
import math
import sys
def P2N(p):
if p == "":
return 0
symbol = LastSymbol(p)
prefix = Prefix(p)
return 4* P2N(prefix) + S2N(symbol)
def LastSymbol(p):
index = len(p)-1
return p[index]
def Prefix(p):
index = len(p)-1
return p[:index]
def S2N(s):
if s=="A":
return 0
elif s=="C":
return 1
elif s=="G":
return 2
else:
return 3
#---------
def N2P(i,k):
if k==1:
return N2S(i)
pref = Q(i,4)
r = REM(i,4)
symbol = N2S(r)
pP = N2P(pref,k-1)
return pP + symbol
def Q(i,d):
return i//d
def REM(i,d):
return i%d
def N2S(r):
if r==0:
return "A"
elif r==1:
return "C"
elif r==2:
return "G"
else:
return "T"
#---------
freqARRAY = []
def ComputingFrequencies(text,k):
for n in range(0,int(math.pow(4,k))):
freqARRAY.append(0)
for i in range(0,len(text)-k+1):
pattern = text[i:i+k]
j = P2N(pattern)
freqARRAY[j] = freqARRAY[j]+1
flist = []
largest = 0
for n in range(0,len(freqARRAY)-1):
if freqARRAY[n]>largest:
largest=freqARRAY[n]
for n in range(0,len(freqARRAY)-1):
if freqARRAY[n]==largest:
flist.append(N2P(n,k))
print largest
return flist
lines = sys.stdin.read().splitlines()
seq=lines[0]
k=int(lines[1])
output = ComputingFrequencies(seq,k)
print output
Main Index