source code
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Naive transliteration from bearophile's program
# contributed by Isaac Gouy
from sys import stdin
def seq_lines():
for line in stdin:
if line.startswith(">THREE"):
break
lines = []
for line in stdin:
if line.startswith(">"):
break
lines.append( line[:-1] )
return lines
def base_counts(bases, seq):
counts = {}
size = len(seq) + 1 - bases
for i in range(size):
nucleo = seq[i: i + bases]
if nucleo in counts:
counts[nucleo] += 1
else:
counts[nucleo] = 1
return counts
def sorted_freq(bases, seq):
keysValues = base_counts(bases, seq).items()
size = len(seq) + 1 - bases
sorted_ = sorted(keysValues, reverse=True, key=lambda kv: kv[1])
return [ (kv[0], 100.0 * kv[1] / size) for kv in sorted_ ]
def specific_count(code, seq):
return base_counts(len(code), seq).get(code,0)
def main():
lines = seq_lines()
seq = "".join([s.upper() for s in lines])
for base in 1,2:
for kv in sorted_freq(base, seq):
print("%s %.3f" % (kv[0], kv[1]))
print()
for code in "GGT", "GGTA", "GGTATT", \
"GGTATTTTAATT", "GGTATTTTAATTTATAGT":
print("%d\t%s" % (specific_count(code, seq), code))
if __name__ == '__main__':
main()
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
MicroPython v1.24.0
preview.44.ge9c898cb3
Thu, 21 Nov 2024 02:38:31 GMT
COMMAND LINE:
/opt/src/micropython/micropython -X heapsize=1024M -X emit=native knucleotide.micropython-8.micropython 0 < knucleotide-input250000.txt
TIMED OUT after 7200s
PROGRAM OUTPUT:
A 30.298
T 30.157
C 19.793
G 19.752
AA 9.177
TA 9.137
AT 9.136
TT 9.094
AC 6.000
CA 5.999
GA 5.986
AG 5.985
TC 5.970
CT 5.970
GT 5.957
TG 5.956
CC 3.915
CG 3.910
GC 3.908
GG 3.902
14717 GGT
4463 GGTA