The Q6600
Benchmarks Game

k-nucleotide Python 3 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# submitted by Ian Osgood
# modified by Sokolov Yura
# modified by bearophile
# modified by xfm for parallelization
# modified by Justin Peel 
# modified by Jean-Baptiste Lamy 

from sys import stdin
from collections import defaultdict

def gen_freq(frame):
    """Count every fragment of length ``frame`` in the global ``sequence``.

    A window of ``frame`` characters is slid over the sequence one position
    at a time and each fragment seen is tallied.

    Returns:
        A ``defaultdict(int)`` mapping fragment -> number of occurrences;
        looking up a fragment that never occurred yields 0.
    """
    global sequence
    counts = defaultdict(int)

    # Single characters need no slicing: iterate the sequence directly.
    if frame == 1:
        for base in sequence:
            counts[base] += 1
        return counts

    # ``end`` is the exclusive end index of each window.
    for end in range(frame, len(sequence) + 1):
        counts[sequence[end - frame:end]] += 1
    return counts

def gen_result(arg):
    """Build one section of the k-nucleotide report as a string.

    If ``arg`` is an int it is a fragment length: every fragment of that
    length is listed as "<fragment> <percent>" with three decimals, ordered
    by descending count and then by descending fragment text, and the
    section ends with a blank line.

    Otherwise ``arg`` is a fragment to look up: the result is a single line
    holding its occurrence count, a tab, and the fragment itself.
    """
    if not isinstance(arg, int):
        counts = gen_freq(len(arg))
        return "%s\t%s\n" % (counts[arg], arg)

    counts = gen_freq(arg)
    total = sum(counts.values())

    # Sort on (count, fragment) so equal counts are ordered by fragment text.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)

    lines = [
        "%s %.3f\n" % (fragment, 100.0 * count / total)
        for fragment, count in ranked
    ]
    lines.append("\n")
    return "".join(lines)

def prepare():
    """Read the target DNA sequence from standard input.

    Lines are skipped until one starting with ">TH" is found. The lines
    after that header, up to the next line starting with ">" or the end of
    input, make up the sequence.

    Returns:
        The sequence upper-cased with all newlines removed, or an empty
        string when no ">TH" header is present in the input.
    """
    # Skip everything up to and including the ">TH..." header line.
    # startswith() is also safe on an empty string, unlike indexing line[0].
    for line in stdin:
        if line.startswith(">TH"):
            break

    # Collect the lines and join once rather than growing a string with +=,
    # which is only linear-time thanks to a CPython-specific optimization.
    chunks = []
    for line in stdin:
        if line.startswith(">"):
            break
        chunks.append(line)
    return "".join(chunks).upper().replace("\n", "")

def _init_worker(seq):
    """Install ``seq`` as the module-level ``sequence`` inside a worker process."""
    global sequence
    sequence = seq


def main():
    """Read the sequence, compute all report sections in parallel, print them.

    The sequence is stored in the module-level ``sequence`` global that
    ``gen_freq`` reads. Each report section is produced by ``gen_result`` in
    a separate worker process and the sections are written to standard
    output in the reverse of the order in which they were submitted.
    """
    global sequence
    sequence = prepare()

    from concurrent.futures import ProcessPoolExecutor

    # Submitted in this order, printed in reverse (single bases first).
    # Presumably the longest fragments go first so the slowest jobs start
    # earliest -- confirm before reordering.
    tasks = ["GGTATTTTAATTTATAGT", "GGTATTTTAATT", "GGTATT", "GGTA", "GGT", 2, 1]

    # Hand the sequence to every worker explicitly. Relying on the global
    # being inherited only works with the "fork" start method; with "spawn"
    # or "forkserver" the workers re-import this module and would never see
    # the value assigned above.
    with ProcessPoolExecutor(initializer=_init_worker,
                             initargs=(sequence,)) as executor:
        results = list(executor.map(gen_result, tasks))

    print("".join(reversed(results)), end="")
    
    
# Run the benchmark only when executed as a script, so that importing this
# module has no side effects.
if __name__ == "__main__":
    main()
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Python 3.8.0


Mon, 11 May 2020 01:45:40 GMT

MAKE:
mv knucleotide.python3 knucleotide.py
mypy .
Success: no issues found in 1 source file

5.97s to complete and log all make actions

COMMAND LINE:
/opt/src/Python-3.8.0/bin/python3 -OO knucleotide.py 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT