diff --git a/dynamic_programming/edit_distance.cpp b/dynamic_programming/edit_distance.cpp new file mode 100644 index 0000000..ac9715a --- /dev/null +++ b/dynamic_programming/edit_distance.cpp @@ -0,0 +1,168 @@ +#include +using namespace std; + + +class Alignment { + + string sx, sy, sa, sb; + + vector > A; + int sxL, syL; + int cost = 0; + + //Below are the functions which are returning costs involved according to operation done in aligning strings + //These are arbitrary cost + + //insertion cost + inline int insC() { + return 4; + } + + //deletion cost + inline int delC() { + return 4; + } + + //modification cost + inline int modC(char fr, char to) { + if(fr == to) + return 0; + return 3; + } + +public: + + //constructor + Alignment(string s1, string s2) : sx(s1), sy(s2), + sxL(s1.length()), syL(s2.length()) + { + //allocating size to array needed + A.resize(s2.length()+4, vector(s1.length()+4, 0)); + } + + //recurrence + int rec(int i, int j) { + return min(modC(sx[i-1], sy[j-1]) + A[i-1][j-1], min(delC() + A[i-1][j], insC() + A[i][j-1])); + //i-1, j-1 in sx, sy b/c of 0-indexing in string + } + + //building array of cost + void form_array() { + + //initialising the base needed in dp + //building up the first column, consider taking first string sx and second as null + for(int i = 0; i <= sxL; i++) { + A[i][0] = i*(delC()); + } + + //building up the first row, consider taking first string null and second as string sy + for(int i = 0; i <= syL; i++) { + A[0][i] = i*(insC()); + } + + //building up whole array from previously known values + for(int i = 1; i <= sxL; i++) { + for(int j = 1; j <= syL; j++) { + + A[i][j] = rec(i, j); + } + } + + cost = A[sxL][syL]; + + } + + //finding the alignment + void trace_back(int i, int j) { + + while(true) { + if(i == 0 || j == 0) { + break; + } + //A[i][j] will have one of the three above values from which it is derived + //so comapring from each one + if(i >= 0 && j >= 0 && rec(i, j) == modC(sx[i-1], sy[j-1]) + A[i-1][j-1]) { + sa.push_back(sx[i-1]); + sb.push_back(sy[j-1]); + i--, j--; + } + + else if((i-1) >= 0 && j >= 0 && rec(i, j) == delC() + A[i-1][j]) { + sa.push_back(sx[i-1]); //0-indexing of string + sb.push_back('-'); + i -= 1; + } + + else if(i >= 0 && (j-1) >= 0){ + sa.push_back('-'); //0-indexing of string + sb.push_back(sy[j-1]); + j -= 1; + } + } + + if(i != 0) { + while(i) { + sa.push_back(sx[i-1]); + sb.push_back('-'); + i--; + } + } + + else { + while(j) { + sa.push_back('-'); + sb.push_back(sy[j-1]); + j--; + } + } + } + + //returning the alignment + pair alignst() { + //reversing the alignments because we have formed the + //alignments from backward(see: trace_back, i, j started from m, n respectively) + reverse(sa.begin(), sa.end()); + reverse(sb.begin(), sb.end()); + return make_pair(sa, sb); + } + + + //returning the cost + int kyc() { return cost;} +}; + +int main() { + + + //converting sx to sy + string sx, sy; + sx = "GCCCTAGCG"; + sy = "GCGCAATG"; + + //standard input stream + //cin >> sx >> sy; + + pair st; + + Alignment dyn(sx, sy); + dyn.form_array(); + dyn.trace_back(sx.length(), sy.length()); + st = dyn.alignst(); + //Alignments can be different for same strings but cost will be same + + cout << "Alignments of the strings\n"; + cout << st.first << "\n"; + cout << st.second << "\n"; + cout << "Cost associated = "; + cout << dyn.kyc() << "\n"; + + /* Alignments + M - modification, D - deletion, I - insertion for converting string1 to string2 + + M M MD + string1 - GCCCTAGCG + string2 - GCGCAAT-G + + */ + +} diff --git a/dynamic_programming/edit_distance_space_efficient.cpp b/dynamic_programming/edit_distance_space_efficient.cpp new file mode 100644 index 0000000..b2c3989 --- /dev/null +++ b/dynamic_programming/edit_distance_space_efficient.cpp @@ -0,0 +1,265 @@ +/* + Dynamic-Programming + Divide & conquer for getting cost and aligned strings for problem of aligning two large strings + Reference - Wikipedia-"Hirschberg's algorithm" +*/ + + +#include +using namespace std; + +//cost associated +int cost = 0; + +//Below are the functions which are returning costs involved according to operation done in aligning strings +//These are arbitrary costs + +//insertion cost +inline int insC() { + return 4; +} + +//deletion cost +inline int delC() { + return 4; +} + +//modification cost +inline int modC(char fr, char to) { + if(fr == to) //if both characters are equal + return 0; + return 3; //if both characters are not equal +} + +//reversing the string +string rever(string s) { + int k = s.length(); + + for(int i = 0; i < k/2; i++) + swap(s[i], s[k-i-1]); + return s; +} + +//minimizing the sum of shortest paths (see:GainAlignment function), calculating next starting point +int minimize(vector ve1, vector ve2, int le) { + int sum, xmid = 0; + + for(int i = 0; i <= le; i++) { + if(i == 0) + sum = ve1[i] + ve2[le-i]; //reversing the array ve2 by taking its i element from last + + if(sum > ve1[i] + ve2[le-i]) { + sum = ve1[i] + ve2[le-i]; + xmid = i; + } + + } + return xmid; +} + +pair stringOne(string s1, string s2) { + + //building of array for case string length one of any of the string + string sa, sb; + + int m = s1.length(); + int n = s2.length(); + + vector > fone(s1.length()+5, vector(2)); + + for(int i = 0; i <= m; i++) + fone[i][0] = i*delC(); + + for(int j = 0; j <= n; j++) + fone[0][j] = j*insC(); + + for(int i = 1; i <= m; i++) { + int j; + //recurrence + for(j = 1; j <= n; j++) { + fone[i][j] = min(modC(s1[i-1], s2[j-1]) + fone[i-1][j-1], min(delC() + fone[i-1][j], insC() + fone[i][j-1])); + } + } + + int i = m, j = n; + cost += fone[i][j]; + /* + This code can be shortened as beforehand we know one of the string has length one but this gives general idea of a + how a backtracking can be done is case of general strings length given we can use m*n space + */ + //backtracking in case the length is one of string + while(true) { + + if(i == 0 || j == 0) + break; + + //fone[i][j] will have one of the three above values from which it is derived so comapring from each one + + if(i >= 0 && j >= 0 && fone[i][j] == modC(s1[i-1], s2[j-1]) + fone[i-1][j-1]) { + sa.push_back(s1[i-1]); + sb.push_back(s2[j-1]); + i--; j--; + } + + else if((i-1) >= 0 && j >= 0 && fone[i][j] == delC() + fone[i-1][j]) { + sa.push_back(s1[i-1]); + sb.push_back('-'); + i-=1; + } + + else if(i >= 0 && (j-1) >= 0 && fone[i][j-1] == insC() + fone[i][j-1]){ + sa.push_back('-'); + sb.push_back(s2[j-1]); + j-=1; + } + } + + //continue backtracking + if(i != 0) { + while(i) { + sa.push_back(s1[i-1]); + sb.push_back('-'); + i--; + } + } + + else { + while(j) { + sa.push_back('-'); + sb.push_back(s2[j-1]); + j--; + } + } + + //strings obtained are reversed alignment because we have started from i = m, j = n + reverse(sa.begin(), sa.end()); + reverse(sb.begin(), sb.end()); + + return make_pair(sa, sb); + +} + +//getting the cost associated with alignment +vector SpaceEfficientAlignment(string s1, string s2) { + + //space efficient version + int m = s1.length(); + int n = s2.length(); + + vector > array2d(m+5, vector(2)); + + //base conditions + for(int i = 0; i <= m; i++) + array2d[i][0] = i*(delC()); + + for(int j = 1; j <= n; j++) { + + array2d[0][1] = j*(insC()); + + //recurrence + for(int i = 1; i <= m; i++) { + array2d[i][1] = min(modC(s1[i-1], s2[j-1]) + array2d[i-1][0], min(delC() + array2d[i-1][1], insC() + array2d[i][0])); + } + + for(int i = 0; i <= m; i++) + array2d[i][0] = array2d[i][1]; + } + + //returning the last column to get the row element x in n/2 column: see GainAlignment function + vector vec(m+1); + for(int i = 0; i <= m; i++) { + vec[i] = array2d[i][1]; + } + + return vec; +} + +pair GainAlignment(string s1, string s2) { + + string te1, te2; //for storing alignments + int l1 = s1.length(); + int l2 = s2.length(); + + //trivial cases of length = 0 or length = 1 + if(l1 == 0) { + for(int i = 0; i < l2; i++) { + te1.push_back('-'); + te2.push_back(s2[i]); + cost += insC(); + } + } + + else if(l2 == 0) { + for(int i = 0; i < l1; i++) { + te1.push_back(s1[i]); + te2.push_back('-'); + cost += delC(); + } + } + + else if(l1 == 1 || l2 == 1) { + pair temp = stringOne(s1, s2); + te1 = temp.first; + te2 = temp.second; + } + + //main divide and conquer + else { + int ymid = l2/2; + /* + We know edit distance problem can be seen as shortest path from initial(0,0) to (l1,l2) + Now, here we are seeing it in two parts from (0,0) to (i,j) and from (i+1,j+1) to (m,n) + and we will see for which i it is getting minimize. + */ + + vector ScoreL = SpaceEfficientAlignment(s1, s2.substr(0, ymid)); //for distance (0,0) to (i,j) + vector ScoreR = SpaceEfficientAlignment(rever(s1), rever(s2.substr(ymid, l2-ymid))); //for distance (i+1,j+1) to (m,n) + + int xmid = minimize(ScoreL, ScoreR, l1); //minimizing the distance + + pair temp = GainAlignment(s1.substr(0, xmid), s2.substr(0, ymid)); + te1 = temp.first; te2 = temp.second; //storing the alignment + + temp = GainAlignment(s1.substr(xmid, l1-xmid), s2.substr(ymid, l2-ymid)); + te1 += temp.first; te2 += temp.second; //storing the alignment + + } + + return make_pair(te1, te2); +} + + +int main() { + string s1, s2; + s1 = "GCCCTAGCG"; + s2 = "GCGCAATG"; + + //cin >> s1 >> s2; /*standard input stream*/ + + /* + If reading from strings from two files + */ + // ifstream file1("num.txt"); + // ifstream file2("nu.txt"); + // getline(file1, s1); + // file1.close(); + + // getline(file2, s2); + // file2.close(); + + pair temp = GainAlignment(s1, s2); + + //Alignments can be different for same strings but cost will be same + cout << "Alignments of strings\n"; + cout << temp.first << "\n"; + cout << temp.second << "\n"; + cout << "Cost associated = " << cost << "\n"; + + /* Alignments + M - modification, D - deletion, I - insertion for converting string1 to string2 + + M M MD + string1 - GCCCTAGCG + string2 - GCGCAAT-G + + */ +}