The Computer Language Benchmarks Game (24.12)

k-nucleotide Java #6 program

source code

/* The Computer Language Benchmarks Game
 https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 
 contributed by James McIlree
 ByteString code thanks to Matthieu Bentot and The Anh Tran
 */

import java.util.*;
import java.io.*;
import java.util.concurrent.*;

/**
 * k-nucleotide benchmark program.
 *
 * <p>Reads the section that follows the {@code >THREE} header from standard
 * input, counts every fragment of lengths 1, 2, 3, 4, 6, 12 and 18 on a thread
 * pool, then prints the relative frequencies of all 1- and 2-letter fragments
 * followed by the absolute counts of a fixed list of longer fragments.
 */
public class knucleotide {
    /** Charset in which every byte value maps to exactly one char, so no input byte is altered. */
    private static final String INPUT_CHARSET = "ISO-8859-1";

    /**
     * Builds one counting task per (fragment length, start offset) pair.
     *
     * <p>For a fragment length {@code n} there are {@code n} tasks, with offsets
     * {@code 0..n-1}; each task walks the sequence in steps of {@code n}, so
     * together they visit every start position exactly once.
     *
     * @param sequence        the bytes to scan; shared read-only by all tasks
     * @param fragmentLengths the fragment lengths to count
     * @return the tasks, grouped by fragment length in the order given and
     *         ordered by offset within each group
     */
    static ArrayList<Callable<Map<ByteString, ByteString>>> createFragmentTasks(final byte[] sequence, int[] fragmentLengths) {
        ArrayList<Callable<Map<ByteString, ByteString>>> tasks = new ArrayList<Callable<Map<ByteString, ByteString>>>();
        for (int fragmentLength : fragmentLengths) {
            for (int index = 0; index < fragmentLength; index++) {
                final int offset = index;
                final int finalFragmentLength = fragmentLength;
                tasks.add(new Callable<Map<ByteString, ByteString>>() {
                    public Map<ByteString, ByteString> call() {
                        return createFragmentMap(sequence, offset, finalFragmentLength);
                    }
                });
            }
        }
        return tasks;
    }

    /**
     * Counts the fragments of {@code fragmentLength} bytes that start at
     * {@code offset}, {@code offset + fragmentLength}, {@code offset + 2 * fragmentLength}, ...
     *
     * @param sequence       the bytes to scan
     * @param offset         index of the first fragment
     * @param fragmentLength number of bytes per fragment
     * @return a map in which each distinct fragment is both key and value; the
     *         value's {@code count} field holds the number of occurrences
     */
    static Map<ByteString, ByteString> createFragmentMap(byte[] sequence, int offset, int fragmentLength) {
        HashMap<ByteString, ByteString> map = new HashMap<ByteString, ByteString>();
        int lastIndex = sequence.length - fragmentLength + 1;
        // Scratch key: refilled on every step and only replaced once it has
        // been stored in the map, so a stored key is never mutated again.
        ByteString key = new ByteString(fragmentLength);
        for (int index = offset; index < lastIndex; index += fragmentLength) {
            key.calculateHash(sequence, index);
            ByteString fragment = map.get(key);
            if (fragment != null) {
                fragment.count++;
            } else {
                map.put(key, key);
                key = new ByteString(fragmentLength);
            }
        }

        return map;
    }

    /**
     * Adds the counts of {@code map2} into {@code map1}.
     *
     * <p>Destructive: {@code map1} is modified in place, and entries that exist
     * only in {@code map2} are inserted into {@code map1} as the same
     * {@code ByteString} objects, so the two maps share them afterwards.
     *
     * @param map1 the map that receives the sums
     * @param map2 the map whose counts are added
     * @return {@code map1}
     */
    static Map<ByteString, ByteString> sumTwoMaps(Map<ByteString, ByteString> map1, Map<ByteString, ByteString> map2) {
        for (Map.Entry<ByteString, ByteString> entry : map2.entrySet()) {
            ByteString sum = map1.get(entry.getKey());
            if (sum != null) {
                sum.count += entry.getValue().count;
            } else {
                map1.put(entry.getKey(), entry.getValue());
            }
        }
        return map1;
    }

    /**
     * Formats every fragment of {@code frequencies} as an upper-cased
     * {@code "FRAGMENT percentage"} line with three decimals.
     *
     * <p>Lines are ordered by descending count; fragments with equal counts are
     * ordered by their bytes, so none of them is lost in the sorted set.
     * Formatting uses {@link Locale#ROOT} so the output does not depend on the
     * default locale.
     *
     * @param totalCount  the divisor used to turn a count into a percentage
     * @param frequencies the fragment counts to print
     * @return the formatted lines followed by one empty line
     */
    static String writeFrequencies(float totalCount, Map<ByteString, ByteString> frequencies) {
        SortedSet<ByteString> sorted = new TreeSet<ByteString>(frequencies.values());
        StringBuilder sb = new StringBuilder();
        for (ByteString k : sorted) {
            float percentage = (float) (k.count) * 100.0f / totalCount;
            sb.append(String.format(Locale.ROOT, "%s %.3f\n", k.toString().toUpperCase(Locale.ROOT), percentage));
        }

        return sb.append('\n').toString();
    }

    /**
     * Sums the occurrences of {@code nucleotideFragment} over all result maps.
     *
     * <p>Maps built for a different fragment length never match, because
     * {@link ByteString#equals(Object)} compares the full byte arrays.
     *
     * @param futures            the completed counting tasks
     * @param nucleotideFragment the fragment to look up
     * @return {@code "<count>\t<FRAGMENT>\n"} with the fragment upper-cased
     * @throws Exception if retrieving a task result fails
     */
    static String writeCount(List<Future<Map<ByteString, ByteString>>> futures, String nucleotideFragment) throws Exception {
        ByteString key = new ByteString(nucleotideFragment.length());
        key.calculateHash(toBytes(nucleotideFragment), 0);

        int count = 0;
        for (Future<Map<ByteString, ByteString>> future : futures) {
            ByteString temp = future.get().get(key);
            if (temp != null) {
                count += temp.count;
            }
        }

        return count + "\t" + nucleotideFragment.toUpperCase(Locale.ROOT) + '\n';
    }

    /**
     * Program entry point: reads the sequence from standard input, runs the
     * counting tasks and prints the report to standard output.
     *
     * @param args ignored
     * @throws Exception if reading the input or running a task fails
     */
    public static void main(String[] args) throws Exception {
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in, INPUT_CHARSET));
        byte[] sequence = readSequence(in);

        int[] fragmentLengths = { 1, 2, 3, 4, 6, 12, 18 };
        ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
        List<Future<Map<ByteString, ByteString>>> futures;
        try {
            futures = pool.invokeAll(createFragmentTasks(sequence, fragmentLengths));
        } finally {
            // Always release the worker threads, even if invokeAll is
            // interrupted; otherwise they would keep the JVM alive.
            pool.shutdown();
        }

        StringBuilder sb = new StringBuilder();

        // futures[0] is the single task for length 1; futures[1] and [2] are
        // the two offsets for length 2 and are merged before printing.
        sb.append(writeFrequencies(sequence.length, futures.get(0).get()));
        sb.append(writeFrequencies(sequence.length - 1, sumTwoMaps(futures.get(1).get(), futures.get(2).get())));

        String[] nucleotideFragments = { "ggt", "ggta", "ggtatt", "ggtattttaatt", "ggtattttaatttatagt" };
        for (String nucleotideFragment : nucleotideFragments) {
            sb.append(writeCount(futures, nucleotideFragment));
        }

        System.out.print(sb.toString());
    }

    /**
     * Reads the lines that follow the {@code >THREE} header and concatenates
     * them without line terminators. Reading stops at end of input or at the
     * next line that starts with {@code '>'}.
     */
    private static byte[] readSequence(BufferedReader in) throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            if (line.startsWith(">THREE")) {
                break;
            }
        }

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[100];
        while ((line = in.readLine()) != null) {
            if (line.startsWith(">")) {
                break;
            }
            int length = line.length();
            if (length > buffer.length) {
                buffer = new byte[length];
            }
            for (int i = 0; i < length; i++) {
                buffer[i] = (byte) line.charAt(i);
            }
            baos.write(buffer, 0, length);
        }
        return baos.toByteArray();
    }

    /** Converts each char of {@code text} to one byte by keeping its low 8 bits. */
    private static byte[] toBytes(String text) {
        byte[] result = new byte[text.length()];
        for (int i = 0; i < result.length; i++) {
            result[i] = (byte) text.charAt(i);
        }
        return result;
    }

    /**
     * A fixed-length byte fragment with a cached hash and an occurrence count.
     *
     * <p>{@code equals}/{@code hashCode} look only at the bytes, so an instance
     * can serve as a map key while its {@code count} changes. The natural
     * ordering looks at the count first and is therefore not consistent with
     * {@code equals}; it is meant for sorting the distinct values of one map.
     */
    static final class ByteString implements Comparable<ByteString> {
        public int hash, count = 1;
        public final byte bytes[];

        /**
         * @param size the fragment length in bytes
         */
        public ByteString(int size) {
            bytes = new byte[size];
        }

        /**
         * Copies {@code bytes.length} bytes of {@code k}, starting at
         * {@code offset}, into this fragment and recomputes the cached hash.
         *
         * @param k      the source bytes
         * @param offset index of the first byte to copy
         */
        public void calculateHash(byte k[], int offset) {
            int temp = 0;
            for (int i = 0; i < bytes.length; i++) {
                byte b = k[offset + i];
                bytes[i] = b;
                temp = temp * 31 + b;
            }
            hash = temp;
        }

        /** Returns the hash cached by the last {@link #calculateHash} call. */
        @Override
        public int hashCode() {
            return hash;
        }

        /** Two fragments are equal when their byte arrays are equal; any other object is unequal. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof ByteString)) {
                return false;
            }
            return Arrays.equals(bytes, ((ByteString) obj).bytes);
        }

        /**
         * Orders by descending count, then by ascending unsigned byte content.
         * The tie-break keeps distinct fragments with equal counts from
         * comparing as equal, which a sorted set would treat as duplicates.
         */
        @Override
        public int compareTo(ByteString other) {
            if (count != other.count) {
                return count > other.count ? -1 : 1;
            }
            int shared = Math.min(bytes.length, other.bytes.length);
            for (int i = 0; i < shared; i++) {
                int difference = (bytes[i] & 0xff) - (other.bytes[i] & 0xff);
                if (difference != 0) {
                    return difference;
                }
            }
            return bytes.length - other.bytes.length;
        }

        /** Returns the fragment as text, one char per byte, independent of the platform charset. */
        @Override
        public String toString() {
            char[] chars = new char[bytes.length];
            for (int i = 0; i < chars.length; i++) {
                chars[i] = (char) (bytes[i] & 0xff);
            }
            return new String(chars);
        }
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
java 23 2024-09-17
Java HotSpot(TM) 64-Bit Server VM
(build 23+37-2369, 
mixed mode, sharing)


 Fri, 20 Sep 2024 03:54:26 GMT

MAKE:
mv knucleotide.java-6.java knucleotide.java
/opt/src/jdk-23/bin/javac -d . -cp .:/opt/src/java-libs/fastutil-8.3.1.jar  knucleotide.java

2.24s to complete and log all make actions

COMMAND LINE:
 /opt/src/jdk-23/bin/java  -cp .:/opt/src/java-libs/fastutil-8.3.1.jar knucleotide 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT