The Computer Language
24.11 Benchmarks Game

k-nucleotide MicroPython #8 program

source code

# The Computer Language Benchmarks Game
#   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
#   Naive transliteration from bearophile's program
#   contributed by Isaac Gouy 

from sys import stdin

def seq_lines():
    """Return the sequence lines of the ``>THREE`` record read from stdin.

    Input lines are consumed up to and including the first line that starts
    with ``>THREE``.  The lines that follow are collected until the next line
    starting with ``>`` or until the input ends.

    Returns:
        A list of the record's lines with the trailing newline removed.  The
        list is empty when no ``>THREE`` header is present.
    """
    # Skip everything up to and including the ">THREE" header line.
    for line in stdin:
        if line.startswith(">THREE"):
            break
    lines = []
    for line in stdin:
        if line.startswith(">"):
            break
        # Strip only the newline: chopping the last character unconditionally
        # would drop a base from a final line that has no trailing newline.
        lines.append(line.rstrip("\n"))
    return lines
    
def base_counts(bases, seq):
    """Tally every overlapping fragment of length `bases` found in `seq`.

    Returns a dict mapping each fragment to the number of times it occurs.
    The dict is empty when `seq` is shorter than `bases`.
    """
    tally = {}
    # One window per valid start position; the range is empty if seq is short.
    for start in range(len(seq) - bases + 1):
        fragment = seq[start:start + bases]
        tally[fragment] = tally.get(fragment, 0) + 1
    return tally
    
def sorted_freq(bases, seq):
    """Return the fragments of length `bases` in `seq` with their frequencies.

    Each entry is a ``(fragment, percent)`` tuple, where ``percent`` is the
    fragment's share of all fragment positions in `seq`.  Entries are ordered
    by descending count, and fragments with equal counts are ordered by
    ascending fragment text.

    The list is empty when `seq` is shorter than `bases`.
    """
    counts = base_counts(bases, seq)
    # Number of fragment positions, i.e. the total of all counts.
    size = len(seq) + 1 - bases
    # Sorting on the count alone leaves ties in an order that depends on dict
    # iteration order and sort stability; the fragment is the tie-breaker so
    # the output is deterministic.
    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))
    return [(kv[0], 100.0 * kv[1] / size) for kv in ranked]
      
def specific_count(code, seq):
    """Return how often `code` occurs in `seq`, counting overlaps (0 if absent)."""
    # Count every fragment of the same length as `code`, then look it up.
    counts = base_counts(len(code), seq)
    if code in counts:
        return counts[code]
    return 0
    
def main():
    """Read the ``>THREE`` record and print the k-nucleotide report.

    Prints the frequency tables for fragments of length 1 and 2, each followed
    by a blank line, then the occurrence count of five specific fragments.
    """
    seq = "".join(seq_lines()).upper()

    # Frequency tables: one "fragment percent" line per distinct fragment.
    for base in (1, 2):
        for fragment, percent in sorted_freq(base, seq):
            print("%s %.3f" % (fragment, percent))
        print()

    # Occurrence counts, tab-separated from the fragment they belong to.
    codes = ("GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT")
    for code in codes:
        print("%d\t%s" % (specific_count(code, seq), code))
 
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
MicroPython v1.24.0-preview.44.ge9c898cb3


 Thu, 21 Nov 2024 02:38:31 GMT

COMMAND LINE:
 /opt/src/micropython/micropython -X heapsize=1024M -X emit=native knucleotide.micropython-8.micropython 0 < knucleotide-input250000.txt

TIMED OUT after 7200s


PROGRAM OUTPUT:
A 30.298
T 30.157
C 19.793
G 19.752

AA 9.177
TA 9.137
AT 9.136
TT 9.094
AC 6.000
CA 5.999
GA 5.986
AG 5.985
TC 5.970
CT 5.970
GT 5.957
TG 5.956
CC 3.915
CG 3.910
GC 3.908
GG 3.902

14717	GGT
4463	GGTA