The Computer Language
24.09 Benchmarks Game

k-nucleotide MicroPython program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# submitted by Ian Osgood
# modified by Sokolov Yura
# modified by bearophile
# 2to3

from sys import stdin

def gen_freq(seq, frame, frequences):
    ns = len(seq) + 1 - frame
    frequences.clear()
    for ii in range(ns):
        nucleo = seq[ii:ii + frame]
        if nucleo in frequences:
            frequences[nucleo] += 1
        else:
            frequences[nucleo] = 1
    return ns, frequences


def sort_seq(seq, length, frequences):
    n, frequences = gen_freq(seq, length, frequences)

    l = sorted(list(frequences.items()), reverse=True, key=lambda seq_freq: (seq_freq[1],seq_freq[0]))

    print('\n'.join("%s %.3f" % (st, 100.0*fr/n) for st,fr in l))
    print()


def find_seq(seq, s, frequences):
    n,t = gen_freq(seq, len(s), frequences)
    print("%d\t%s" % (t.get(s, 0), s))


def main():
    frequences = {}
    for line in stdin:
        if line[0:3] == ">TH":
            break

    seq = []
    for line in stdin:
        if line[0] in ">;":
            break
        seq.append( line[:-1] )
    sequence = "".join(seq).upper()

    for nl in 1,2:
        sort_seq(sequence, nl, frequences)

    for se in "GGT GGTA GGTATT GGTATTTTAATT GGTATTTTAATTTATAGT".split():
        find_seq(sequence, se, frequences)

main()
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
MicroPython v1.24.0
preview.44.ge9c898cb3


 Thu, 20 Jun 2024 00:41:35 GMT

COMMAND LINE:
 /opt/src/micropython/micropython -X heapsize=1024M -X emit=native knucleotide.micropython 0 < knucleotide-input250000.txt

TIMED OUT after 7200s


PROGRAM OUTPUT:
A 30.298
T 30.157
C 19.793
G 19.752

AA 9.177
TA 9.137
AT 9.136
TT 9.094
AC 6.000
CA 5.999
GA 5.986
AG 5.985
TC 5.970
CT 5.970
GT 5.957
TG 5.956
CC 3.915
CG 3.910
GC 3.908
GG 3.902

14717	GGT
4463	GGTA