Problem

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

Example

given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", return: ["AAAAACCCCC", "CCCCCAAAAA"].

JavaScript Code

function findRepeatedDnaSequences(s) {
    var result = [];
 
    var len = s.length;
    if (len < 10) {
        return result;
    }
 
    var map = {};
    map['A'] = 0;
    map['C'] = 1;
    map['G'] = 2;
    map['T'] = 3;
 
    var temp = [];
    var added = [];
 
    var hash = 0;
    for (var i = 0; i < len; i++) {
        if (i < 9) {
            //each ACGT fit 2 bits, so left shift 2
            hash = (hash << 2) + map[s.charAt(i)]; 
        } else {
            hash = (hash << 2) + map[s.charAt(i)];
            //make length of hash to be 20
            hash = hash &  (1 << 20) - 1; 
 
            if (temp.indexOf(hash)!=-1 && added.indexOf(hash)==-1) {
                result.push(s.substring(i - 9, i + 1));
                added.push(hash); //track added
            } else {
                temp.push(hash);
            }
        }
 
    }
 
    return result;
}

console.log(findRepeatedDnaSequences('AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT'));

1 comment:

Blogroll

Follow this blog by Email

Popular Posts