#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<math.h>
#include"SP.h"


// Translates a DNA sequence (arg1) into a peptide that is written to (arg2).
// There are 6 ways to do that: a reading-frame offset of 0, 1 or 2 (arg3),
// applied either to the read itself or to its reverse complement.
// The size N of the DNA short-read line is in (arg4).
void Translate (char*, char*, int, int);

// Reads the DNA (arg1) backwards, complementing each base (A<->T, G<->C),
// and puts the result in dnaComp (arg2).
// The size N of the DNA short-read line is in (arg3).
void Complete (char*, char*, int);

// Copies the characters of (arg2) into (arg1). Unlike strcpy, it does not
// write a terminating '\0'.
void Copy (char*, char*);

// Takes an SP (arg1) and looks for it in a peptide (arg2) with the KMP algorithm,
// using the SP's KMP table (arg3) and the length of the SP (arg4).
// The number of peptide characters to scan, L, is in (arg5).
// Returns the index of the first match in the peptide, or -1 if there is none.
int KMPSearch (char*, char*, int*, int, int);


// Allocates `size` bytes (at least one) or terminates the program with an
// error message, so callers never have to deal with a NULL result.
static void* alloc_or_die (size_t size) {
	void* block = malloc(size > 0 ? size : 1);
	if (block == NULL) {
		fprintf(stderr, "out of memory\n");
		exit(EXIT_FAILURE);
	}
	return block;
}

// argv[0] is always the executable file, argv[1] is the DNA short-read sequence file,
// argv[2] is the SPs file and argv[3] is the count output file.
//
// For every line (short read) of argv[1] the read and its reverse complement
// are translated in the three reading frames each, and every SP is searched
// for in the six resulting peptides. Each hit is recorded in the SP as the
// 1-based line number (_sr) and the frame index 0..5 (_frame); frames 0-2
// belong to the read itself, frames 3-5 to its reverse complement.
// The line number is printed to stdout as a progress indicator.
//
// Returns EXIT_SUCCESS when the whole file was processed, EXIT_FAILURE when
// the arguments are missing or the read file cannot be opened.
int main (int argc, char* argv[]) {

	// NOTE: a line longer than READ_BUF_SIZE-1 characters is returned by
	// fgets in several pieces and therefore handled as several reads.
	enum { READ_BUF_SIZE = 200000, NUM_FRAMES = 6 };

	static char line[READ_BUF_SIZE];
	char* pep[NUM_FRAMES];
	int pepLen[NUM_FRAMES];
	char* dnaComp;
	FILE* dnaSeq;
	SP* sps;
	int N, L;
	int i, f, t;

	if (argc < 4) {
		fprintf(stderr, "usage: %s <short-read file> <SP file> <count output file>\n",
			argc > 0 ? argv[0] : "program");
		return EXIT_FAILURE;
	}

	// Initialize the Specific Peptides
	sps = alloc_or_die(Nsp * sizeof(SP));
	initSP(sps, argv);

	// open the file that contains the DNA short reads, one per line
	dnaSeq = fopen(argv[1], "r");
	if (dnaSeq == NULL) {
		perror(argv[1]);
		return EXIT_FAILURE;
	}

	// t is the current line number
	t = 0;

	// read one line at a time
	while (fgets(line, sizeof line, dnaSeq) != NULL) {

		t++; printf("%d\n",t);

		// Strip the line ending instead of assuming there is exactly one
		// trailing '\n': the last line of a file may have none, and files
		// with CRLF endings carry an extra '\r'.
		N = (int)strcspn(line, "\r\n");
		line[N] = '\0';

		// a peptide holds at most one residue per 3 bases
		L = N / 3;

		dnaComp = alloc_or_die((size_t)N + 1);
		for (f = 0; f < NUM_FRAMES; f++) {
			pep[f] = alloc_or_die((size_t)L + 1);
		}

		// translate the read in its 3 forward frames
		for (f = 0; f < 3; f++) {
			Translate(line, pep[f], f, N);
		}

		// build the reverse complement and translate its 3 frames
		Complete(line, dnaComp, N);
		for (f = 0; f < 3; f++) {
			Translate(dnaComp, pep[3 + f], f, N);
		}

		// Search only the characters Translate actually produced; a frame
		// can be shorter than L, and the bytes after its terminator were
		// never written.
		for (f = 0; f < NUM_FRAMES; f++) {
			pepLen[f] = (int)strlen(pep[f]);
		}

		// Go over SPs and see if there is a match using KMP
		for (i = 0; i < Nsp; i++) {
			for (f = 0; f < NUM_FRAMES; f++) {
				int* grown;

				if (KMPSearch(sps[i]._seq, pep[f], sps[i]._table, sps[i]._length, pepLen[f]) < 0) {
					continue;
				}

				// Record the hit in the free slot, then grow both arrays so
				// that one free slot is available for the next hit.
				// NOTE(review): this relies on initSP allocating _sr and
				// _frame with room for at least one entry - confirm in SP.h.
				sps[i]._sr[sps[i]._count] = t;
				sps[i]._frame[sps[i]._count] = f;
				sps[i]._count++;

				grown = realloc(sps[i]._sr, ((size_t)sps[i]._count + 1) * sizeof(int));
				if (grown == NULL) {
					fprintf(stderr, "out of memory\n");
					exit(EXIT_FAILURE);
				}
				sps[i]._sr = grown;

				grown = realloc(sps[i]._frame, ((size_t)sps[i]._count + 1) * sizeof(int));
				if (grown == NULL) {
					fprintf(stderr, "out of memory\n");
					exit(EXIT_FAILURE);
				}
				sps[i]._frame = grown;
			}
		}

		// free the per-read buffers
		for (f = 0; f < NUM_FRAMES; f++) {
			free(pep[f]);
		}
		free(dnaComp);
	}

	fclose(dnaSeq);

	// print to files the count of sps
	printSP(sps, argv);

	return EXIT_SUCCESS;

}


// Copies the strlen(s2) characters of s2 to the start of s1.
// The terminating '\0' of s2 is NOT copied, so s1 is only a valid C string
// afterwards if the caller terminates it (or it already was terminated
// further on).
void Copy(char* s1, char* s2) {
	int left = strlen(s2);
	char* out = s1;
	char* in = s2;

	while (left > 0) {
		*out++ = *in++;
		left--;
	}
}


// Knuth-Morris-Pratt search for the word W (an SP) inside P (a peptide).
//   W     - the word to look for
//   P     - the text that is searched
//   T     - fallback table for W; T[k] is consulted after a mismatch at W[k].
//           The code expects T[0] to move the search on by one position
//           (T[0] == -1) and uses T[k] as the next index into W for k > 0.
//   Wsize - number of characters of W
//   L     - number of characters of P that may be examined
// Returns the index in P where the first occurrence of W starts, or -1 when
// W does not occur within the first L characters.
int KMPSearch (char* W, char* P, int* T, int Wsize, int L) {
	int start = 0;    // where the current candidate match begins in P
	int matched = 0;  // how many characters of W agree so far

	while (start + matched < L) {
		if (W[matched] != P[start + matched]) {
			// mismatch: shift the candidate according to the table and keep
			// the part of the prefix that is still known to agree
			int fallback = T[matched];
			start += matched - fallback;
			if (matched > 0) {
				matched = fallback;
			}
			continue;
		}

		matched++;
		if (matched == Wsize) {
			return start;
		}
	}

	// the text is exhausted without a full match
	return -1;

}



// Translates the DNA sequence d into the peptide p using the standard
// genetic code.
//   d - the bases; only d[m] .. d[N-1] are read
//   p - output buffer; receives one character per translated codon followed
//       by '\0', so it needs room for (N-m)/3 + 1 characters
//   m - reading-frame offset: index of the first base of the first codon
//   N - number of bases in d
// Every complete codon d[i..i+2] with i = m, m+3, ... and i+2 < N is
// translated; trailing bases that do not fill a codon are ignored.
// Stop codons become '*'. A codon that contains an 'N' (unspecified base, as
// in the DeLong data) becomes '0'. A codon that contains any other character
// outside T/C/A/G produces no output at all.
void Translate (char* d, char* p, int m, int N) {
	// Amino acids indexed by 16*first + 4*second + third base, with the
	// bases numbered T=0, C=1, A=2, G=3.
	static const char codon_table[] =
		"FFLLSSSSYY**CC*W"   // T..
		"LLLLPPPPHHQQRRRR"   // C..
		"IIIMTTTTNNKKSSRR"   // A..
		"VVVVAAAADDEEGGGG";  // G..
	int i, j, k;

	j = 0;
	// i is the first base of a codon; stop as soon as a codon would run
	// past the end of the read
	for (i = m; i + 2 < N; i += 3) {
		int index = 0;
		int known = 1;   // all three bases are one of T/C/A/G
		int has_n = 0;   // at least one base is 'N'

		for (k = 0; k < 3; k++) {
			int base;
			switch (d[i + k]) {
				case 'T': base = 0; break;
				case 'C': base = 1; break;
				case 'A': base = 2; break;
				case 'G': base = 3; break;
				default:  base = -1; break;
			}
			if (base < 0) {
				known = 0;
				if (d[i + k] == 'N') {
					has_n = 1;
				}
			} else {
				index = index * 4 + base;
			}
		}

		if (known) {
			p[j] = codon_table[index]; j++;
		} else if (has_n) {
			// not specified (in DeLong data marked as N)
			p[j] = '0'; j++;
		}
	}
	p[j] = '\0';

}



// Writes the reverse complement of the DNA sequence d into c.
//   d - the bases; d[0] .. d[N-1] are read
//   c - output buffer with room for N + 1 characters; it is '\0'-terminated
//   N - number of bases in d
// The sequence is read from its last base to its first and every base is
// replaced by its complement (A<->T, G<->C). Any other character, such as
// the 'N' used for an unspecified base, is copied unchanged so that every
// position of c is always written.
void Complete(char* d, char* c, int N) {
	int i;
	for (i = 0; i < N; i++) {
		char base = d[N - 1 - i];
		switch (base) {
			case 'A': c[i] = 'T'; break;
			case 'T': c[i] = 'A'; break;
			case 'G': c[i] = 'C'; break;
			case 'C': c[i] = 'G'; break;
			default:  c[i] = base; break;
		}
	}
	c[i] = '\0';
}