/* 
Massive Parallel Genetic Algorithm for NVIDIA Graphics Processing Units
Copyright (c) 2010, Piotr E. Srokosz, University of Warmia and Mazury in Olsztyn
All rights reserved.
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/timeb.h>
#include <time.h>
#include <memory.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include <Windows.h>

/* ----------------------- TYPES & STRUCTURES ----------------------- */

struct Constants
{
	/* iteration cap used by the Newton-type loops in the kernels */
	int MaxIter;
	/* number of individuals processed by the kernels (index guard) */
	int TotalSize;

	/* number of bits used to encode each of the four variables */
	unsigned short xoBits;
	unsigned short xcBits;
	unsigned short ycBits;
	unsigned short sig0Bits;

	/* material data: coh and gam enter the stress formulas of RTO_GPU
	   (presumably cohesion and unit weight - confirm with the author) */
	double coh;
	double gam;
	/* weights of |T|, R and hcr in the linear objective; wE only appears
	   in the commented-out exponential objective */
	double wT;
	double wR;
	double wH;
	double wE;
	/* probabilities of crossover, mutation (per bit) and migration */
	double CrossP;
	double MutP;
	double MigP;

	/* search ranges used when decoding the four variables */
	double maxxo;
	double minxo;
	double maxxc;
	double minxc;
	double maxyc;
	double minyc;
	double maxsig0;
	double minsig0;

	/* tangents of the angles Fi, Beta and Alfa set up in main() */
	double tanFi;
	double tanBeta;
	double tanAlfa;
	/* main() stores pi/2 - Beta here */
	double Tetag;
	/* convergence tolerance of the Newton-type loops */
	double TolErr;
} Data;


#define WORK_PATH "H:\\MyWorks\\CUDAWork\\"
#define RNV_PATH "R:\\TEMP\\"
//#define RNV_PATH "H:\\MyWorks\\MTRNG\\"
//#define RNV_PATH "H:\\MyWorks\\MTRNG\\"

/* ------------------ GENERAL CONSOLE FUNCTIONS --------------------- */

/* Clears the console by running the shell command "cls". */
void
clrscr( void )
{
	system( "cls" );
}

/* Moves the cursor of the standard-output console to column x, row y. */
void
gotoxy(int x, int y)
{
	HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
	COORD target;

	target.X = x;
	target.Y = y;
	SetConsoleCursorPosition(hOut, target);
}

/* Restores the default console colours: red+green+blue foreground
   (white) with no background bits set (black). */
void
SetTextDefault( void )
{
	SetConsoleTextAttribute( GetStdHandle(STD_OUTPUT_HANDLE),
							 FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE );
}

/* Switches the console to reverse video: red+green+blue background
   (white) with no foreground bits set (black). */
void
SetTextReverse( void )
{
	SetConsoleTextAttribute( GetStdHandle(STD_OUTPUT_HANDLE),
							 BACKGROUND_RED | BACKGROUND_GREEN | BACKGROUND_BLUE );
}

/* Sets the console text attribute from two colour codes.
   fColor - foreground colour, clamped to 0..15 (values above 7 set the bright bit)
   bColor - background colour, clamped to 0..15 (values above 7 set the bright bit)
   The attribute is the foreground code in the low four bits and the
   background code in the next four bits. */
void SetTextColor( int fColor, int bColor )
{
	HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
	int fg = fColor;
	int bg = bColor;

	/* clamp both codes into the 4-bit range */
	if( fg > 15 ) fg = 15;
	if( bg > 15 ) bg = 15;
	if( fg < 0 ) fg = 0;
	if( bg < 0 ) bg = 0;

	/* low nibble = foreground (bit 3 is "bright"), high nibble = background */
	SetConsoleTextAttribute ( hOut, ( bg << 4 ) | fg );
}

/* Overwrites columns x1..x2 (inclusive) of row y with spaces.
   Nothing is printed when x2 < x1; the cursor is still moved to (x1, y). */
void
clrline( int x1, int x2, int y )
{
	int col = x1;

	gotoxy( x1, y );
	while( col <= x2 )
	{
		printf(" ");
		col++;
	}
}

/* ------------------------- DEVICE FUNCTIONS ----------------------- */


/* Kernel GPU (genetic algorithm): creates the starting population.
   One thread per individual; the flat index is built from threadIdx.x and a
   2D grid of blocks. Threads with index >= InputData.TotalSize do nothing.
   RNV      - random numbers; individual i consumes RNV[4*i+strnv .. 4*i+strnv+3]
              (assumed to lie in [0,1] - confirm with the caller)
   strnv    - start offset into RNV
   xornd, xcrnd, ycrnd, sig0rnd - output genotypes, each scaled to
              0 .. 2^bits-1 of the corresponding variable */
__global__ void MakePopulation_GPU( double* RNV, int strnv, struct Constants InputData,
								   unsigned short* xornd, unsigned short* xcrnd, unsigned short* ycrnd, unsigned short* sig0rnd )
{
	const int idx = threadIdx.x + ( blockIdx.x + blockIdx.y * gridDim.x ) * blockDim.x;
	if( idx >= InputData.TotalSize ) return;

	/* the four consecutive random numbers owned by this individual */
	const double* draw = RNV + ( 4 * idx + strnv );

	xornd[idx]   = (unsigned short)rint( draw[0] * ( pow( 2.0, InputData.xoBits ) - 1.0 ));
	xcrnd[idx]   = (unsigned short)rint( draw[1] * ( pow( 2.0, InputData.xcBits ) - 1.0 ));
	ycrnd[idx]   = (unsigned short)rint( draw[2] * ( pow( 2.0, InputData.ycBits ) - 1.0 ));
	sig0rnd[idx] = (unsigned short)rint( draw[3] * ( pow( 2.0, InputData.sig0Bits ) - 1.0 ));
}

/* Binary-reflected (Gray) decoding of one genotype of nBits bits:
   the top bit is copied, every lower bit is the XOR of the already decoded
   bit above it and the corresponding input bit. */
static __device__ unsigned short decodeGray_dev( unsigned short gray, unsigned short nBits )
{
	unsigned short top = nBits - (unsigned short)1;
	unsigned short value = ( gray & ( 1 << top ));
	unsigned short bit;

	for( bit = top; bit > (unsigned short)0; bit-- )
		value += (( value & ( 1 << bit )) >> 1 ) ^ ( gray & ( 1 << ( bit - 1 )));

	return value;
}

/* Maps an nBits-wide natural binary code linearly onto [lo, hi]. */
static __device__ double mapToRange_dev( unsigned short code, unsigned short nBits, double lo, double hi )
{
	return lo + ( (double)code / ( pow( 2.0, (double)nBits ) - 1.0 )) * ( hi - lo );
}

/* Kernel GPU (genetic algorithm): decodes variables.
   One thread per individual; threads with index >= InputData.TotalSize do nothing.
   ivar1..ivar4 - Gray-coded genotypes (xo, xc, yc, sig0 bit widths from InputData)
   res1..res4   - decoded values scaled into [min, max] of the matching variable */
__global__ void DecodePhenots_GPU( unsigned short* ivar1, unsigned short* ivar2, unsigned short* ivar3, unsigned short* ivar4, 
				struct Constants InputData, 
				double* res1, double* res2, double* res3, double* res4 )
{
	const int idx = threadIdx.x + ( blockIdx.x + blockIdx.y * gridDim.x ) * blockDim.x;
	if( idx >= InputData.TotalSize ) return;

	/* Binary Reflected Decoding */
	const unsigned short code1 = decodeGray_dev( ivar1[idx], InputData.xoBits );
	const unsigned short code2 = decodeGray_dev( ivar2[idx], InputData.xcBits );
	const unsigned short code3 = decodeGray_dev( ivar3[idx], InputData.ycBits );
	const unsigned short code4 = decodeGray_dev( ivar4[idx], InputData.sig0Bits );

	/* Natural Binary Re-decoding */
	res1[idx] = mapToRange_dev( code1, InputData.xoBits, InputData.minxo, InputData.maxxo );
	res2[idx] = mapToRange_dev( code2, InputData.xcBits, InputData.minxc, InputData.maxxc );
	res3[idx] = mapToRange_dev( code3, InputData.ycBits, InputData.minyc, InputData.maxyc );
	res4[idx] = mapToRange_dev( code4, InputData.sig0Bits, InputData.minsig0, InputData.maxsig0 );
}


/* Kernel GPU (slope stability): finds limit of integration.
   One thread per individual; threads with index >= InputData.TotalSize do nothing.
   For individual i a Newton iteration, started from Teta = 1.57, solves
       cos( Teta - Tetag ) * exp( tanFi * ( Teta - Teta0 )) = Rg / R0
   and stores the last iterate in Te[i]. The loop stops when the step is
   below InputData.TolErr or after more than InputData.MaxIter iterations. */
__global__ void find_teta_GPU( double* xo, double* xc, double* yc,
				struct Constants InputData, double* Te ) 
{
	const int idx = threadIdx.x + ( blockIdx.x + blockIdx.y * gridDim.x ) * blockDim.x;
	if ( idx >= InputData.TotalSize ) return;

	/* geometry of the starting point relative to the pole (xc, yc) */
	const double yStart = xo[idx] * InputData.tanAlfa;
	const double dx = xc[idx] - xo[idx];
	const double R0 = hypot(dx, ( yStart - yc[idx] ));
	const double Teta0 = acos( dx / R0 );
	const double Rg = xc[idx] * cos( InputData.Tetag )- yc[idx] * sin( InputData.Tetag );

	double Teta = 1.57;
	for( int iter = 1; ; iter++ )
	{
		const double shifted = Teta - InputData.Tetag;
		const double growth = exp( InputData.tanFi * ( Teta - Teta0 ));
		double f = cos( shifted ) * growth;
		/* derivative is formed from f before the constant term is subtracted */
		const double df = f * InputData.tanFi - sin( shifted ) * growth;
		f -= Rg / R0;

		const double next = Teta - f / df;
		const double step = fabs( Teta - next );
		Teta = next;

		/* accuracy reached, or iteration budget exhausted */
		if( step < InputData.TolErr || iter > InputData.MaxIter ) break;
	}
	Te[idx] = Teta;
}

/* Kernel GPU (slope stability): calculates R, T and Objective function (4 Gauss points).
   One thread per individual; threads with index >= InputData.TotalSize do nothing.
   Inputs : xo, xc, yc, sig0 - decoded variables of every individual
            Tetae            - upper integration limit (written by find_teta_GPU)
   Outputs: R[i]   = hypot( Rv, Rh )
            T[i]   = Rv * xc - Rh * yc + M
            Hcr[i] = yc + Re * sin( Tetae )
            Err[i] = change of (Rv, Rh, M) between the last two refinement passes
            Obj[i] = 1000 - wR*R - wT*|T| - wH*hcr when Err < 1e-4, otherwise 0;
                     non-finite or negative values are replaced by 0.
   The interval [Teta0, Tetae] is split into `gran` slices (100 at first);
   each slice is integrated with a 4-point Gauss-Legendre rule. The number
   of slices is doubled until the result changes by less than maxerr, the
   error starts to grow, or MaxIter/4 passes have been made. */
__global__  void RTO_GPU( double* xo, double* xc, double* yc, double* sig0, double* Tetae,
	        struct Constants InputData, double* R, double* T, double* Obj, double* Hcr, double* Err ) 
{
	int i = threadIdx.x + ( blockIdx.x + blockIdx.y * gridDim.x ) * blockDim.x;
	int j, k, m = 0, l, gran = 100;
	int maxit = InputData.MaxIter / 4; 
	double dTeta;
	double err, err_prev = 1.0E10, maxerr = 1.0E-4;
	double machz = 1.0E-9, newTeta;
	double Rv, Rh, M, Rv0, Rh0, M0, yo, dx, R0, cosT0, sinT0;
	double Teta0, dT, Re, eT0, t2F, ctFi, ThFi, NTFi;
	double Teta, Tstart, Tend, sinT, cosT, eT, r, y; 
	double yk, yp, sig, tau, x, dRv0, dRv, dRh0, dRh, dM0, dM;
	double xstart, xend, deltax, dT2, dX2, a, cTp, eTp, denom;
	double C1, abase, hcr;
	//double kappa, kappa0 = -0.06;
	/* 4-point Gauss-Legendre nodes and weights on [-1, 1] in full double
	   precision; the weights sum to 2 */
	double GP1 = -0.8611363115940526, GP2 = -0.3399810435848563, GP3 = 0.3399810435848563,	GP4 = 0.8611363115940526;
	double GPC1 = 0.3478548451374538, GPC2 = 0.6521451548625461, GPC3 = 0.6521451548625461, GPC4 = 0.3478548451374538;
	
	if( i < InputData.TotalSize )
	{
		/* quantities that do not change during the integration */
		yo = xo[i] * InputData.tanAlfa;
		dx = xc[i] - xo[i];
		R0 = hypot( dx, ( yo - yc[i] ));
		cosT0 = dx / R0;
		Teta0 = acos( cosT0 );
		dTeta = Tetae[i] - Teta0;
		Re = R0 * exp( InputData.tanFi * ( Tetae[i] - Teta0 ));
		sinT0 = sin( Teta0 );
		eT0 = exp( 2.0 * InputData.tanFi * Teta0 );
		t2F = 9.0 * InputData.tanFi * InputData.tanFi + 1.0;
		ctFi = InputData.coh / InputData.tanFi;
		ThFi = 3.0 * InputData.tanFi;
		NTFi = -2.0 * InputData.tanFi;
		C1 = eT0 * ( sig0[i] - InputData.gam * R0 * ( ThFi * cosT0 + sinT0 ) / t2F + ctFi );
		abase = exp( InputData.tanFi * Teta0 ) / R0;
		hcr = yc[i] + Re * sin( Tetae[i] );
		//kappa = 1.0 - sig0 / ( gam * hcr ) / kappa0;
	
		//gran = rint( dT / dTeta );
		dT = dTeta / (double)gran;

		/* refinement loop: one pass integrates the whole interval with the
		   current number of slices */
		l = 0;
		do
		{
			dT2 = dT / 2.0;
			Rv0 = 0.0; Rh0 = 0.0; M0 = 0.0;
			Tstart = Teta0;
			for( m=0; m<gran; m++)
			{		
				Tend = Tstart + dT;
				if( Tend > Tetae[i] ) Tend = Tetae[i];
				xstart = xc[i] - R0 * exp( InputData.tanFi * ( Tstart - Teta0 )) * cos( Tstart );
				xend = xc[i] - R0 * exp( InputData.tanFi * ( Tend - Teta0 )) * cos( Tend );
				deltax = xend - xstart;
				/* keep |deltax| away from zero, preserving its sign */
				if( fabs( deltax ) < machz ) 
				{
					if( deltax < 0.0 ) deltax = -machz; else deltax = machz;
				}
		
				dX2 = deltax / 2.0;
		
				// Gauss integration 
				for( j=1; j<5; j++ )
				{
					if( j == 1 ){ Teta = dT2 * GP1; x = dX2 * GP1; }
					if( j == 2 ){ Teta = dT2 * GP2; x = dX2 * GP2; }
					if( j == 3 ){ Teta = dT2 * GP3; x = dX2 * GP3; }
					if( j == 4 ){ Teta = dT2 * GP4; x = dX2 * GP4; }
			
					/* shift the node from [-1,1] onto the current slice */
					Teta += ( Tstart + Tend ) / 2.0;
					x += ( xstart + xend ) / 2.0;
			
					a = abase * ( xc[i] - x );
					// precising Teta according to x
					/* Newton iteration for cos(Teta)*exp(tanFi*Teta) = a; setting
					   k = MaxIter ends the loop once the step is below TolErr */
					k = 0;
					newTeta = Teta;
					do
					{
						cTp = cos( Teta );
						eTp = exp( InputData.tanFi * Teta );
						denom = eTp * ( InputData.tanFi * cTp - sin( Teta ));
						if( fabs( denom ) < machz )
						{
							if( denom < 0.0 ) denom = -machz; else denom = machz;
						}
						newTeta -= ( cTp * eTp - a ) / denom;
						if( fabs(Teta - newTeta) < InputData.TolErr ) k = InputData.MaxIter;
						k++;
						Teta = newTeta;
					}while( k < InputData.MaxIter );			
					sinT = sin( Teta );
					cosT = cos( Teta );
					eT = exp( NTFi * Teta );
					r = R0 * exp( InputData.tanFi * ( Teta - Teta0 ));
					y = yc[i] + r * sinT;
					sig = InputData.gam * r * ( ThFi * cosT + sinT ) / t2F - ctFi + C1 * eT;
					tau = sig * InputData.tanFi + InputData.coh;	
					/* surface ordinate: slope tanAlfa left of the origin, tanBeta right of it */
					if( x < 0.0 ) yk = InputData.tanAlfa * x; else yk = InputData.tanBeta * x;	
					denom = y - yc[i] - InputData.tanFi * ( xc[i] - x );			
					if( fabs( denom ) < machz )
					{
						if( denom < 0.0 ) denom = -machz; else denom = machz;
					}
					yp = ( InputData.tanFi * ( y - yc[i] ) + xc[i] - x ) / denom ;
					dRv0 = InputData.gam * ( y - yk ) - sig - tau * yp;
					dRh0 = sig * yp - tau;
					dM0 = dRh0 *  y - dRv0 * x;
					/* weighted sum of the integrands; j == 1 initialises the sums */
					if( j == 1 )
					{
						dRv = GPC1 * dRv0;
						dRh = GPC1 * dRh0;
						dM = GPC1 * dM0;
					}
					if( j == 2 )
					{
						dRv += GPC2 * dRv0;
						dRh += GPC2 * dRh0;
						dM += GPC2 * dM0;
					}
					if( j == 3 )
					{
						dRv += GPC3 * dRv0;
						dRh += GPC3 * dRh0;
						dM += GPC3 * dM0;
					}
					if( j == 4 )
					{
						dRv += GPC4 * dRv0;
						dRh += GPC4 * dRh0;
						dM += GPC4 * dM0;
					}
				}//gauss interval integration

				/* half the slice width in x is the Jacobian of the [-1,1] mapping */
				Rv0 += dX2 * dRv;
				Rh0 += dX2 * dRh;
				M0 += dX2 * dM;
				Tstart = Tend;
			}//integration
		
			if( l==0 )
			{
				/* first pass: nothing to compare with yet */
				Rv = Rv0;
				Rh = Rh0;
				M = M0;
				err = 1.0;
			}
			else
			{
				/* mean absolute change against the previous pass */
				err = fabs( Rv - Rv0 );
				err += fabs( Rh - Rh0 );
				err += fabs( M - M0 );
				err /= 3.0;
				Rv = Rv0;
				Rh = Rh0;
				M = M0;
			}
			/* stop when converged or when the error started to grow;
			   otherwise double the number of slices */
			if(( err < maxerr ) || ( err_prev < err ))
			{
				l = maxit;
			}
			else
			{
				gran *= 2;
				dT = dTeta / (double)gran;
				err_prev = err;
			}
			l++;
		}while( l < maxit );
		//step (gran) autoadaptation
		
		Err[i] = err;
		T[i] = Rv * xc[i] - Rh * yc[i] + M;
		R[i] = hypot( Rv, Rh );
		Hcr[i] = hcr;

		/* linear form */
		if( err < maxerr )
			Obj[i] = 1000.0 - InputData.wR * R[i] - InputData.wT * fabs( T[i] ) -  InputData.wH * hcr;
		else
			Obj[i] = 0.0;
		/* exponential form */
		//Obj[i] = pow( InputData.wR, R[i] ) + pow( InputData.wT, abs( T[i] )) + pow( InputData.wH, hcr ) + pow( InputData.wE, err );
		// protection 
		if( !isfinite(Obj[i]) ) Obj[i] = 0.0;
		if( Obj[i] < 0.0 ) Obj[i] = 0.0;
	}
}

/* Kernel GPU (random numbers): recalculates RNV.
   One thread per element; converts the first N 16-bit integers of RNVin
   into doubles in [0,1] by dividing by 65535. Threads with index >= N do nothing. */
__global__  void RNV_GPU( unsigned short* RNVin, double* RNVout, int N ) 
{
	const int idx = threadIdx.x + ( blockIdx.x + blockIdx.y * gridDim.x ) * blockDim.x;
	if( idx >= N ) return;

	RNVout[idx] = (double)RNVin[idx] / 65535.0;
}

/* Kernel GPU (genetic algorithm): statistics - average objective.
   One block per sub-population: outdata[block] receives the mean of the
   elements of indata owned by that block.
   Preconditions: blockDim.x <= 128 (size of the shared buffer).
   Every thread of a block reaches every barrier: elements beyond N are
   replaced by 0 and the sum is divided by the number of valid elements,
   so a partial last block gives a correct mean. Blocks that lie entirely
   beyond N write nothing. */
__global__ void avg_GPU( double* indata, double* outdata, int N ) 
{
	__shared__ double sdata[128];
	unsigned int tid = threadIdx.x;
	unsigned int j = blockIdx.x + blockIdx.y * gridDim.x;
	unsigned int base = j * blockDim.x;
	unsigned int i = tid + base;
	unsigned int valid;
	unsigned int s;

	/* uniform for the whole block, so returning here cannot split a barrier */
	if( N <= 0 || base >= (unsigned int)N ) return;

	valid = (unsigned int)N - base;
	if( valid > blockDim.x ) valid = blockDim.x;

	/* 0 is the neutral element of the sum for lanes without data */
	sdata[tid] = ( i < (unsigned int)N ) ? indata[i] : 0.0;
	__syncthreads();

	for( s=1; s < blockDim.x; s *= 2) 
	{
		/* second condition keeps the read inside the block when
		   blockDim.x is not a power of two */
		if(( tid % (2*s) == 0 ) && ( tid + s < blockDim.x ))
		{
			sdata[tid] += sdata[tid + s];
		}
		__syncthreads();
	}

	if (tid == 0) outdata[j] = sdata[0] / (double)valid;
}

/* Kernel GPU (genetic algorithm): statistics - minimum objective.
   One block per sub-population: outdata[block] receives the smallest of the
   elements of indata owned by that block.
   Preconditions: blockDim.x <= 128 (size of the shared buffer).
   Every thread of a block reaches every barrier: lanes beyond N are filled
   with the first element of the block, which cannot change the minimum.
   Blocks that lie entirely beyond N write nothing. */
__global__ void min_GPU( double* indata, double* outdata, int N ) 
{
	__shared__ double sdata[128];
	unsigned int tid = threadIdx.x;
	unsigned int j = blockIdx.x + blockIdx.y * gridDim.x;
	unsigned int base = j * blockDim.x;
	unsigned int i = tid + base;
	unsigned int s;

	/* uniform for the whole block, so returning here cannot split a barrier */
	if( N <= 0 || base >= (unsigned int)N ) return;

	sdata[tid] = ( i < (unsigned int)N ) ? indata[i] : indata[base];
	__syncthreads();

	for( s=1; s < blockDim.x; s *= 2) 
	{
		/* second condition keeps the read inside the block when
		   blockDim.x is not a power of two */
		if(( tid % (2*s) == 0 ) && ( tid + s < blockDim.x ))
		{
			/* keep the own value only when it is strictly smaller */
			if( !( sdata[tid] < sdata[tid + s] ))
				sdata[tid] = sdata[tid + s];
		}
		__syncthreads();
	}

	if (tid == 0) outdata[j] = sdata[0];
}

/* Kernel GPU (genetic algorithm): statistics - maximum objective.
   One block per sub-population: outdata[block] receives the largest of the
   elements of indata owned by that block and ioutdata[block] the global
   index of an element holding that value.
   Preconditions: blockDim.x <= 128 (size of the shared buffers).
   Every thread of a block reaches every barrier: lanes beyond N are filled
   with the first element of the block (value and index), which cannot
   change the maximum and always refers to a valid individual.
   Blocks that lie entirely beyond N write nothing. */
__global__ void max_GPU( double* indata, double* outdata, int* ioutdata, int N ) 
{
	__shared__ double sdata[128];
	__shared__ int idata[128];
	unsigned int tid = threadIdx.x;
	unsigned int j = blockIdx.x + blockIdx.y * gridDim.x;
	unsigned int base = j * blockDim.x;
	unsigned int i = tid + base;
	unsigned int s;

	/* uniform for the whole block, so returning here cannot split a barrier */
	if( N <= 0 || base >= (unsigned int)N ) return;

	if( i < (unsigned int)N )
	{
		sdata[tid] = indata[i];
		idata[tid] = i;
	}
	else
	{
		sdata[tid] = indata[base];
		idata[tid] = base;
	}
	__syncthreads();

	for( s=1; s < blockDim.x; s *= 2) 
	{
		/* second condition keeps the read inside the block when
		   blockDim.x is not a power of two */
		if(( tid % (2*s) == 0 ) && ( tid + s < blockDim.x ))
		{
			/* keep the own pair only when it is strictly larger */
			if( !( sdata[tid] > sdata[tid + s] ))
			{
				sdata[tid] = sdata[tid + s];
				idata[tid] = idata[tid + s];
			}
		}
		__syncthreads();
	}

	if (tid == 0) 
	{
		outdata[j] = sdata[0];
		ioutdata[j] = idata[0];
	}
}


/* Kernel GPU (genetic algorithm): scale coefficients.
   For every block j < M computes the coefficients of the linear scaling
   Fit = a[j] * Obj + b[j] from the minimum, average and maximum objective
   of sub-population j (fmult = 2):
     - if minObj > ( fmult*avg - max ) / ( fmult - 1 ) the scaling keeps the
       average and maps max to fmult*avg;
     - otherwise it keeps the average and maps min to 0.
   When the denominator of the selected formula is not positive (all
   objectives of the sub-population equal, e.g. all zero) the identity
   scaling a = 1, b = 0 is stored instead of NaN/Inf.
   The index is the block index only, so all threads of a block write the
   same values. */
__global__ void ab_GPU( double* minObj, double* avgObj, double* maxObj, double* a, double* b, int M ) 
{	
	int j = blockIdx.x + blockIdx.y * gridDim.x;
	double fmult = 2.0;
	double check, delta1, delta2, lo, mid, hi;
	double aj = 1.0, bj = 0.0;	/* identity scaling as the safe default */

	if( j < M )
	{
		lo = minObj[j];
		mid = avgObj[j];
		hi = maxObj[j];
		delta1 = hi - mid;
		delta2 = mid - lo;
		check = ( fmult * mid - hi ) / ( fmult - 1.0 );
	
		if( lo > check )
		{
			/* !( x > 0 ) also rejects NaN */
			if( delta1 > 0.0 )
			{
				aj = mid * ( fmult - 1.0 ) / delta1;
				bj = mid * ( hi - fmult * mid ) / delta1;
			}
		}
		else
		{
			if( delta2 > 0.0 )
			{
				aj = mid / delta2;
				bj = -1.0 * lo * mid / delta2;
			}
		}
		a[j] = aj;
		b[j] = bj;
	}
}


/* Kernel GPU (genetic algorithm): scales objective into fitness.
   One thread per individual, one block per sub-population:
   Fit[i] = Obj[i] * a[block] + b[block]. Threads with index >= N do nothing. */
__global__ void O2F_GPU( double* Obj, double* a, double* b, double* Fit, int N )
{
	const int blockId = blockIdx.x + blockIdx.y * gridDim.x;
	const int idx = threadIdx.x + blockId * blockDim.x;
	if( idx >= N ) return;

	Fit[idx] = Obj[idx] * a[blockId] + b[blockId];
}


/* Kernel GPU (genetic algorithm): tournaments of individuals creating parental population.
   One thread per individual, one block per sub-population. Each thread draws
   four candidates from its own block using RNV[4*i+strnv .. +3] (scaled by
   blockDim.x-1 and rounded), plays two semi-finals and a final on Fit, and
   copies the genotype of the winner from (ixo, ixc, iyc, is0) into slot i of
   (jxo, jxc, jyc, js0). On equal fitness the second candidate of a pair wins.
   Threads with index >= N do nothing. */
__global__ void tournaments_GPU( double* RNV, int strnv, double* Fit, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, 
								 unsigned short* jxo, unsigned short* jxc, unsigned short* jyc, unsigned short* js0, int N )
{
	const int blockId = blockIdx.x + blockIdx.y * gridDim.x;
	const int first = blockId * blockDim.x;
	const int self = threadIdx.x + first;
	if( self >= N ) return;

	const double span = (double) blockDim.x - 1.0;
	const double* draw = RNV + ( 4 * self + strnv );

	//choice of sub-population
	const int c1 = rint( draw[0] * span ) + first;
	const int c2 = rint( draw[1] * span ) + first;
	const int c3 = rint( draw[2] * span ) + first;
	const int c4 = rint( draw[3] * span ) + first;

	//tournaments
	const int semi1 = ( Fit[c1] > Fit[c2] ) ? c1 : c2;
	const int semi2 = ( Fit[c3] > Fit[c4] ) ? c3 : c4;
	const int champion = ( Fit[semi1] > Fit[semi2] ) ? semi1 : semi2;

	//winner
	jxo[self] = ixo[champion];
	jxc[self] = ixc[champion];
	jyc[self] = iyc[champion];
	js0[self] = is0[champion];
}


/* Kernel GPU (genetic algorithm): crossover of parents.
   One thread per individual, one block per sub-population.
   Threads of the lower half of a block (tid < blockDim.x/2) pick a partner
   from their own block (RNV[6*i+strnv] scaled by blockDim.x-1) and, with
   probability CrossP (tested against RNV[6*i+strnv+1]), mix the two
   genotypes bit-wise with the random 16-bit masks usRNV[6*i+strnv+2 .. +5];
   otherwise both parents are copied unchanged. The first child goes to slot
   i of (jxo, jxc, jyc, js0); the second child is handed over through shared
   memory to the thread tid + blockDim.x/2, which stores it in its own slot.
   Preconditions: blockDim.x <= 128 (the shared buffer holds 4 values for
   each thread of the lower half).
   The barrier is executed by every thread of the block, also by threads
   with index >= N; those threads neither read nor write data. */
__global__ void crossover_GPU( double* RNV, unsigned short* usRNV, int strnv, double CrossP, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, 
								 unsigned short* jxo, unsigned short* jxc, unsigned short* jyc, unsigned short* js0, int N )
{
	__shared__ unsigned short partner[256];
	int tid = threadIdx.x;
	int j = blockIdx.x + blockIdx.y * gridDim.x;
	int offs = j * blockDim.x;
	int halfsize = blockDim.x / 2;
	int i1 = tid + offs;
	int active = ( i1 < N );
	int k, i2, ipartner;
	double blocksize = (double) blockDim.x - 1.0;
	unsigned short crossmat1, crossmat2;

	if( active && ( tid < halfsize ))
	{
		k = 6 * i1 + strnv;
		//choice of partner individual
		i2 = rint( RNV[k] * blocksize ) + offs;
		/* a partial last block must not pick a partner beyond the population */
		if( i2 >= N ) i2 = N - 1;
		ipartner = 4 * tid;
		//checking possibility of crossover
		if( RNV[k+1] < CrossP )
		{				
			/* crossmat2 is the bit-wise complement of crossmat1, so each
			   child takes every bit from exactly one parent */
			crossmat1 = usRNV[k+2];
			crossmat2 = (unsigned short)65535 - crossmat1;
			jxo[i1] = ( ixo[i1] & crossmat1 ) + ( ixo[i2] & crossmat2 );
			partner[ipartner] = ( ixo[i1] & crossmat2 ) + ( ixo[i2] & crossmat1 );
			
			crossmat1 = usRNV[k+3];
			crossmat2 = (unsigned short)65535 - crossmat1;
			jxc[i1] = ( ixc[i1] & crossmat1 ) + ( ixc[i2] & crossmat2 );
			partner[ipartner + 1] = ( ixc[i1] & crossmat2 ) + ( ixc[i2] & crossmat1 );

			crossmat1 = usRNV[k+4];
			crossmat2 = (unsigned short)65535 - crossmat1;
			jyc[i1] = ( iyc[i1] & crossmat1 ) + ( iyc[i2] & crossmat2 );
			partner[ipartner + 2] = ( iyc[i1] & crossmat2 ) + ( iyc[i2] & crossmat1 );

			crossmat1 = usRNV[k+5];
			crossmat2 = (unsigned short)65535 - crossmat1;
			js0[i1] = ( is0[i1] & crossmat1 ) + ( is0[i2] & crossmat2 );
			partner[ipartner + 3] = ( is0[i1] & crossmat2 ) + ( is0[i2] & crossmat1 );
		}
		else
		{
			jxo[i1] = ixo[i1];
			partner[ipartner] = ixo[i2];
			
			jxc[i1] = ixc[i1];
			partner[ipartner + 1] = ixc[i2];

			jyc[i1] = iyc[i1];
			partner[ipartner + 2] = iyc[i2];

			js0[i1] = is0[i1];
			partner[ipartner + 3] = is0[i2];
		}
	}

	/* outside any data-dependent branch: reached by all threads of the block */
	__syncthreads();

	if( active && ( tid >= halfsize ))
	{
		/* an active upper-half thread always has an active lower-half
		   producer, because the producer has the smaller index */
		ipartner = 4 * ( tid - halfsize );
		jxo[i1] = partner[ipartner];
		jxc[i1] = partner[ipartner + 1];
		jyc[i1] = partner[ipartner + 2];
		js0[i1] = partner[ipartner + 3];
	}
}


/* Kernel GPU (genetic algorithm): mutation of bits with probability MutP.
   One thread per individual. Individual i owns 64 random numbers
   RNV[64*i+strnv .. +63]: 16 for each of the four genotypes, in the order
   xo, xc, yc, sig0. Bit n of a genotype is flipped when its random number
   is smaller than MutP. Threads with index >= N do nothing. */
__global__ void mutation_GPU( double* RNV, int strnv, double MutP, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, int N )
{
	int tid = threadIdx.x;
	int j = blockIdx.x + blockIdx.y * gridDim.x;
	int offs = j * blockDim.x;
	int i = tid + offs;
	int k, n;
	unsigned short bit;
	unsigned short matxo, matxc, matyc, mats0;

	if( i < N )
	{	
		matxo = (unsigned short)0;
		matxc = (unsigned short)0;
		matyc = (unsigned short)0;
		mats0 = (unsigned short)0;
		k = 64 * i + strnv;
		for( n=0; n<16; n++)
		{
			/* 2^n as an exact integer shift instead of rint( pow( 2.0, n )) */
			bit = (unsigned short)( 1u << n );
			if( RNV[k+n] < MutP ) matxo |= bit;
			if( RNV[k+n+16] < MutP ) matxc |= bit;
			if( RNV[k+n+32] < MutP ) matyc |= bit;
			if( RNV[k+n+48] < MutP ) mats0 |= bit;
		}
		/* XOR with the mask flips exactly the selected bits */
		ixo[i] ^= matxo;
		ixc[i] ^= matxc;
		iyc[i] ^= matyc;
		is0[i] ^= mats0;
	}
}


/* Kernel GPU (genetic algorithm): migration L-5 with probability of MigP.
   One thread per individual, one block per sub-population; the blocks form
   a gridDim.x by gridDim.y torus. Idx[b] holds the index of the candidate
   offered by block b (presumably its best individual - confirm with the
   caller). For each of the four neighbours N, E, W, S, in this order, the
   individual is replaced by the neighbour's candidate when
   RNV[4*i+strnv+n] < MigP and the candidate has a larger objective O.
   Genotypes and O are overwritten in place. Threads with index >= N do nothing. */
__global__ void migrationL5_GPU( double* RNV, int strnv, double MigP, double* O, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, int* Idx, int N ) 
{
	const int gsx = gridDim.x;
	const int gsy = gridDim.y;
	const int bx = blockIdx.x;
	const int by = blockIdx.y;
	const int self = (int)threadIdx.x + ( bx + by * gsx ) * (int)blockDim.x;
	if( self >= N ) return;

	/* neighbouring block coordinates with wrap-around */
	const int west  = ( bx > 0 ) ? bx - 1 : gsx - 1;
	const int east  = ( bx + 1 < gsx ) ? bx + 1 : 0;
	const int north = ( by > 0 ) ? by - 1 : gsy - 1;
	const int south = ( by + 1 < gsy ) ? by + 1 : 0;

	int donors[4];
	donors[0] = Idx[bx + north * gsx];
	donors[1] = Idx[east + by * gsx];
	donors[2] = Idx[west + by * gsx];
	donors[3] = Idx[bx + south * gsx];

	const double* draw = RNV + ( 4 * self + strnv );

	//tournaments and replacings
	for( int t = 0; t < 4; t++ )
	{
		const int donor = donors[t];
		if( !( draw[t] < MigP )) continue;
		if( !( O[self] < O[donor] )) continue;

		ixo[self] = ixo[donor];
		ixc[self] = ixc[donor];
		iyc[self] = iyc[donor];
		is0[self] = is0[donor];
		O[self] = O[donor];
	}
}


/* Kernel GPU (genetic algorithm): migration L-9 with probability of MigP.
   Same scheme as the L-5 migration, but with eight donor blocks on the
   torus: the direct neighbours N, E, W, S followed by the blocks two steps
   away N2, E2, W2, S2. Individual i uses RNV[8*i+strnv .. +7]; it is
   replaced by the candidate Idx[block] of a donor block when the random
   number is below MigP and the candidate has a larger objective O.
   Threads with index >= N do nothing. */
__global__ void migrationL9_GPU( double* RNV, int strnv, double MigP, double* O, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, int* Idx, int N ) 
{
	const int gsx = gridDim.x;
	const int gsy = gridDim.y;
	const int bx = blockIdx.x;
	const int by = blockIdx.y;
	const int self = (int)threadIdx.x + ( bx + by * gsx ) * (int)blockDim.x;
	if( self >= N ) return;

	/* first ring, with wrap-around */
	const int west  = ( bx > 0 ) ? bx - 1 : gsx - 1;
	const int east  = ( bx + 1 < gsx ) ? bx + 1 : 0;
	const int north = ( by > 0 ) ? by - 1 : gsy - 1;
	const int south = ( by + 1 < gsy ) ? by + 1 : 0;

	/* second ring: one more step from the first-ring coordinate */
	const int west2  = ( west > 0 ) ? west - 1 : gsx - 1;
	const int east2  = ( east + 1 < gsx ) ? east + 1 : 0;
	const int north2 = ( north > 0 ) ? north - 1 : gsy - 1;
	const int south2 = ( south + 1 < gsy ) ? south + 1 : 0;

	int donors[8];
	donors[0] = Idx[bx + north * gsx];
	donors[1] = Idx[east + by * gsx];
	donors[2] = Idx[west + by * gsx];
	donors[3] = Idx[bx + south * gsx];
	donors[4] = Idx[bx + north2 * gsx];
	donors[5] = Idx[east2 + by * gsx];
	donors[6] = Idx[west2 + by * gsx];
	donors[7] = Idx[bx + south2 * gsx];

	const double* draw = RNV + ( 8 * self + strnv );

	//tournaments and replacings
	for( int t = 0; t < 8; t++ )
	{
		const int donor = donors[t];
		if( !( draw[t] < MigP )) continue;
		if( !( O[self] < O[donor] )) continue;

		ixo[self] = ixo[donor];
		ixc[self] = ixc[donor];
		iyc[self] = iyc[donor];
		is0[self] = is0[donor];
		O[self] = O[donor];
	}
}


/* Kernel GPU (genetic algorithm): migration C-9 with probability of MigP.
   Same scheme as the L-5 migration, but with the eight blocks surrounding
   the own block on the torus as donors, in the order N, E, W, S, NE, SE,
   NW, SW. Individual i uses RNV[8*i+strnv .. +7]; it is replaced by the
   candidate Idx[block] of a donor block when the random number is below
   MigP and the candidate has a larger objective O.
   Threads with index >= N do nothing. */
__global__ void migrationC9_GPU( double* RNV, int strnv, double MigP, double* O, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, int* Idx, int N ) 
{
	const int gsx = gridDim.x;
	const int gsy = gridDim.y;
	const int bx = blockIdx.x;
	const int by = blockIdx.y;
	const int self = (int)threadIdx.x + ( bx + by * gsx ) * (int)blockDim.x;
	if( self >= N ) return;

	/* neighbouring block coordinates with wrap-around */
	const int west  = ( bx > 0 ) ? bx - 1 : gsx - 1;
	const int east  = ( bx + 1 < gsx ) ? bx + 1 : 0;
	const int north = ( by > 0 ) ? by - 1 : gsy - 1;
	const int south = ( by + 1 < gsy ) ? by + 1 : 0;

	int donors[8];
	/* edge neighbours */
	donors[0] = Idx[bx + north * gsx];
	donors[1] = Idx[east + by * gsx];
	donors[2] = Idx[west + by * gsx];
	donors[3] = Idx[bx + south * gsx];
	/* corner neighbours */
	donors[4] = Idx[east + north * gsx];
	donors[5] = Idx[east + south * gsx];
	donors[6] = Idx[west + north * gsx];
	donors[7] = Idx[west + south * gsx];

	const double* draw = RNV + ( 8 * self + strnv );

	//tournaments and replacings
	for( int t = 0; t < 8; t++ )
	{
		const int donor = donors[t];
		if( !( draw[t] < MigP )) continue;
		if( !( O[self] < O[donor] )) continue;

		ixo[self] = ixo[donor];
		ixc[self] = ixc[donor];
		iyc[self] = iyc[donor];
		is0[self] = is0[donor];
		O[self] = O[donor];
	}
}


/* Kernel GPU (genetic algorithm): migration LC-13 with probability of MigP.
   Combination of the L-9 and C-9 neighbourhoods: twelve donor blocks on the
   torus in the order N, E, W, S, NE, SE, NW, SW, N2, E2, W2, S2 (the last
   four are two steps away). Individual i uses RNV[12*i+strnv .. +11]; it is
   replaced by the candidate Idx[block] of a donor block when the random
   number is below MigP and the candidate has a larger objective O.
   Threads with index >= N do nothing. */
__global__ void migrationLC13_GPU( double* RNV, int strnv, double MigP, double* O, unsigned short* ixo, unsigned short* ixc, unsigned short* iyc, unsigned short* is0, int* Idx, int N ) 
{
	const int gsx = gridDim.x;
	const int gsy = gridDim.y;
	const int bx = blockIdx.x;
	const int by = blockIdx.y;
	const int self = (int)threadIdx.x + ( bx + by * gsx ) * (int)blockDim.x;
	if( self >= N ) return;

	/* first ring, with wrap-around */
	const int west  = ( bx > 0 ) ? bx - 1 : gsx - 1;
	const int east  = ( bx + 1 < gsx ) ? bx + 1 : 0;
	const int north = ( by > 0 ) ? by - 1 : gsy - 1;
	const int south = ( by + 1 < gsy ) ? by + 1 : 0;

	/* second ring: one more step from the first-ring coordinate */
	const int west2  = ( west > 0 ) ? west - 1 : gsx - 1;
	const int east2  = ( east + 1 < gsx ) ? east + 1 : 0;
	const int north2 = ( north > 0 ) ? north - 1 : gsy - 1;
	const int south2 = ( south + 1 < gsy ) ? south + 1 : 0;

	int donors[12];
	/* edge neighbours */
	donors[0] = Idx[bx + north * gsx];
	donors[1] = Idx[east + by * gsx];
	donors[2] = Idx[west + by * gsx];
	donors[3] = Idx[bx + south * gsx];
	/* corner neighbours */
	donors[4] = Idx[east + north * gsx];
	donors[5] = Idx[east + south * gsx];
	donors[6] = Idx[west + north * gsx];
	donors[7] = Idx[west + south * gsx];
	/* blocks two steps away */
	donors[8] = Idx[bx + north2 * gsx];
	donors[9] = Idx[east2 + by * gsx];
	donors[10] = Idx[west2 + by * gsx];
	donors[11] = Idx[bx + south2 * gsx];

	const double* draw = RNV + ( 12 * self + strnv );

	//tournaments and replacings
	for( int t = 0; t < 12; t++ )
	{
		const int donor = donors[t];
		if( !( draw[t] < MigP )) continue;
		if( !( O[self] < O[donor] )) continue;

		ixo[self] = ixo[donor];
		ixc[self] = ixc[donor];
		iyc[self] = iyc[donor];
		is0[self] = is0[donor];
		O[self] = O[donor];
	}
}


/* Creates (truncates) the text file WORK_PATH + fileName and writes one
   header line into it.
   Returns 0 on success; prints openErr / closeErr and returns 1 when the
   file cannot be created / closed. */
static int
writeHeaderFile_CPU( const char* fileName, const char* header, const char* openErr, const char* closeErr )
{
	FILE *fp;
	char path[255];

	strcpy( path, WORK_PATH );
	strcat( path, fileName );
	fp = fopen( path, "wt" );
	if( fp == NULL )
	{
		printf( "%s", openErr );
		return 1;
	}

	fprintf( fp, "%s", header );

	if( fclose( fp ) )
	{
		printf( "%s", closeErr );
		return 1;
	}
	return 0;
}

/* Kernel CPU (IO operations): saves headers in files.
   Creates results.txt, best.txt and populations.txt in WORK_PATH, each with
   its column header, then prints the program banner.
   Returns 0 on success and 1 as soon as one file cannot be created or closed
   (the remaining files are then not touched and the banner is not printed). */
int
init_CPU( void )
{
	//results container
	if( writeHeaderFile_CPU( "results.txt",
			"Gener.     iGA       Obj   minObj   avgObj   maxObj        R        T        H        e       xo       xc       yc     sig0\n",
			"Cannot create results file.\n",
			"The results file was not closed.\n" ))
		return 1;

	//best results container
	if( writeHeaderFile_CPU( "best.txt",
			"Gener.     minObj   avgObj   maxObj        R        T        H        e       xo       xc       yc     sig0\n",
			"Cannot create best results file.\n",
			"The best results file was not closed.\n" ))
		return 1;

	//population container
	if( writeHeaderFile_CPU( "populations.txt",
			"Gener.     iGA  iPop       Obj        R        T        H        e       xo       xc       yc     sig0\n",
			"Cannot create populations file.\n",
			"The populations file was not closed.\n" ))
		return 1;

	printf("Slope Stability + GA (CUDA) v.1.1, P.Srokosz, 2010.\n");

	return 0;
}

/* Kernel CPU (IO operations): reads random numbers.
   Reads up to maxsize 16-bit values from the binary file
   RNV_PATH + "Uniform" + <cntr> + ".b16" into uRNV.
   Returns the number of values read, -1 when the file cannot be opened and
   -2 when it cannot be closed.
   The file name is built with a single sprintf: the former 10-byte buffer
   filled by itoa() was too small for counters of ten or more characters. */
int
read_RNV( int cntr, unsigned short* uRNV, int maxsize )
{
	int packsize;
	FILE *inoutfile;
	char path[255];
		
	//reading a vector of true random numbers
	sprintf( path, "%sUniform%d.b16", RNV_PATH, cntr );
	inoutfile = fopen( path, "rb" );
	if( inoutfile == NULL ) return -1;

	packsize = (int)fread( uRNV, sizeof(unsigned short), maxsize, inoutfile );
	if ( fclose( inoutfile ) ) return -2;

	return packsize;
}


/* Kernel CPU (genetic algorithm): saving best individuals and statistics.
   Appends one line per sub-population to results.txt (its best individual
   together with the min/avg/max objective of the sub-population) and one
   line for the overall best individual of generation itr to best.txt.
   R, T, O, H, Err, xo, xc, yc, sig0 - per-individual arrays, sizeGA*sizePop long
   Oavg, Omin, Omax                  - per-sub-population statistics, sizeGA long
   Hcr, Lcr, Ocr, Rcr, Tcr           - outputs: H, xo, objective, R and T of
                                       the overall best individual
   The best individual of a sub-population defaults to its first member and
   the overall-best values default to 0, so nothing uninitialised is read
   or written when no objective is greater than 0. */
void
save_CPU( int itr, double* R, double* T, double* O, double* H, double* Err,
		  double* Oavg, double* Omin, double* Omax, 
		  double* xo, double* xc, double* yc, double* sig0, double *Hcr, double *Lcr, double *Ocr, double *Rcr, double *Tcr,
		  const int sizeGA, const int sizePop )
{
	FILE *outfile, *bestfile;
	int iga, jpop, pos, iBest;
	double maxObj, Best = 0.0;
	double Avg = 0.0, Min = 0.0, Max = 0.0, Rb = 0.0, Tb = 0.0, Hb = 0.0;
	double xob = 0.0, xcb = 0.0, ycb = 0.0, sig0b = 0.0, Errb = 0.0;
	char path[255];

	strcpy( path, WORK_PATH );
	strcat( path, "results.txt" );
	if( (outfile = fopen( path, "at" )) == NULL )
	{
		printf( "Cannot open results file.\n" );
		return;
	}

	strcpy( path, WORK_PATH );
	strcat( path, "best.txt" );
	if( (bestfile = fopen( path, "at" )) == NULL )
	{
		printf( "Cannot open best results file.\n" );
		fclose( outfile );
		return;
	}

	/* with an empty sub-population there is no individual to report */
	for(iga = 0; ( sizePop > 0 ) && ( iga < sizeGA ); iga++)
	{
		maxObj = 0.0;
		iBest = iga * sizePop;	/* fall back to the first member */
		for(jpop = 0; jpop < sizePop; jpop++)
		{
			pos = jpop + iga * sizePop;
			if( O[pos] > maxObj )
			{
				maxObj = O[pos];
				iBest = pos;
			}
		}
		/* remember the best individual over all sub-populations */
		if( Best < maxObj )
		{
			Best = maxObj;
			Avg = Oavg[iga];
			Min = Omin[iga];
			Max = Omax[iga];
			Rb = R[iBest];
			Tb = T[iBest];
			Hb = H[iBest];
			xob = xo[iBest];
			xcb = xc[iBest];
			ycb = yc[iBest];
			sig0b = sig0[iBest];
			Errb = Err[iBest];
		}
		fprintf(outfile,"Gen: %.3d i: %.3d %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.5f %8.3f %8.3f %8.3f %8.3f\n", itr, iga, O[iBest], Omin[iga], Oavg[iga], Omax[iga], R[iBest], T[iBest], H[iBest], Err[iBest], xo[iBest], xc[iBest], yc[iBest], sig0[iBest] );
	}
	fprintf(bestfile,"Gen: %.3d %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.5f %8.3f %8.3f %8.3f %8.3f\n", itr, Min, Avg, Max, Rb, Tb, Hb, Errb, xob, xcb, ycb, sig0b );

	*Hcr = Hb;
	*Lcr = xob;
	*Ocr = Best;
	*Rcr = Rb;
	*Tcr = Tb;

	if ( fclose( outfile ) )
	{
		printf( "The results file was not closed.\n" );
	}

	if ( fclose( bestfile ) )
	{
		printf( "The best results file was not closed.\n" );
	}
}

/* Kernel CPU (genetic algorithm): saving all populations and statistics.
   Appends one line per individual of generation itr to populations.txt:
   sub-population number, position inside it, objective, R, T, H and the
   four decoded variables. All arrays are indexed by
   position + sub-population * sizePop. Prints a message and returns when
   the file cannot be opened. */
void
saveall_CPU( int itr, double* R, double* T, double* O, double* H, 
		     double* xo, double* xc, double* yc, double* sig0,
		     const int sizeGA, const int sizePop )
{
	char path[255];
	FILE *fp;
	int ga, member;

	strcpy( path, WORK_PATH );
	strcat( path, "populations.txt" );
	fp = fopen( path, "at" );
	if( fp == NULL )
	{
		printf( "Cannot open populations file.\n" );
		return;
	}

	for( ga = 0; ga < sizeGA; ga++ )
	{
		for( member = 0; member < sizePop; member++ )
		{
			const int at = member + ga * sizePop;
			fprintf(fp,"Gen: %.3d i: %.3d j: %.3d %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, ga, member, O[at], R[at], T[at], H[at], xo[at], xc[at], yc[at], sig0[at] );
		}
	}

	if ( fclose( fp ) )
	{
		printf( "The populations file was not closed.\n" );
	}
}


/* Checks the outcome of the most recent CUDA runtime call(s) or kernel launch
   and then waits for the device to finish.
   Returns 0 on success. On failure prints the CUDA error text in red and
   returns 1; the caller is expected to abandon the run. */
static int
cuda_step_failed( void )
{
	cudaError_t err = cudaGetLastError();

	/* errors raised while a kernel executes are only reported by the synchronization call */
	if( err == cudaSuccess ) err = cudaThreadSynchronize();
	if( err == cudaSuccess ) return 0;

	SetTextColor( 12, 0 );
	printf( "%s", cudaGetErrorString( err ) );
	return 1;
}

/* Main code
   Sets the problem data, allocates host and device buffers, then repeatedly
   reads a package of random numbers (read_RNV) and runs generations on the GPU
   until maxGen generations are done or an error occurs. After every generation
   the results are copied back to the host, saved by save_CPU and the timing
   files (timing.txt, timing2.txt in WORK_PATH) are extended.
   Returns 0 when the run completes and 1 when it stops on an error. */
int main(int argc, char* argv[])
{



	/* -------------------- CONTROL -------------------- */
	/* algorithm = -1 : random searching
	   algorithm = 0 : GA
	   algorithm = 1 : PGA
	*/
	int algorithm = 1;


	/* -------------------- DATA -------------------- */
	
	int maxPop = 128; //HAS TO BE POWER OF 2 and multiple of 32 and not greater than 128!
	int secGAx = 16; //width of GA in toroidal structure
	int secGAy = 16; //height of GA in toroidal structure
	int typGA = 5; //type of migration structure: 5 - Linear5, 9 - Linear9, 10 - Compact9, 13 - Combined13
	int maxGen = 200;
	int Cycle = 40;
	int maxGA = secGAx * secGAy;
	
	double Alfa = 0.0 * M_PI / 180.0;
	double Beta = 70.0 * M_PI  / 180.0; 
	double Fi = 20.0 * M_PI / 180.0;
	Data.coh = 10.0;
	Data.gam = 16.0;
	Data.wT = 0.4;
	Data.wR = 0.5;
	Data.wH = 0.3;
	//not used
	Data.wE = 1.0;
	/*
	Data.wT = 0.88;
	Data.wR = 0.88;
	Data.wH = 0.95;
	Data.wE = 1.0;
	*/
	
	Data.maxxo = -1.0;
	Data.minxo = -6.11;
	Data.maxxc = 6.11;
	Data.minxc = 1.0;
	Data.maxyc = -1.0;
	Data.minyc = -6.11;
	Data.maxsig0 = -2.0;
	Data.minsig0 = -12.23;

	Data.xoBits = (unsigned short)9;
	Data.xcBits = (unsigned short)9;
	Data.ycBits = (unsigned short)9;
	Data.sig0Bits = (unsigned short)10;
	
	Data.CrossP = 1.0;
	Data.MutP = 0.005;
	Data.MigP = 1.0/(double)maxPop; //individuals per population
	
	Data.TolErr = 1.0E-9;
	Data.MaxIter = 20;

	Data.tanFi = tan( Fi );
	Data.tanBeta = tan( Beta );
	Data.tanAlfa = tan( Alfa );
	Data.Tetag = M_PI / 2.0 - Beta;	
	Data.TotalSize = maxGA * maxPop;

	char yStart = 2;//display info



	/* -------------------- VARIABLES -------------------- */
	/* NOTE: every declaration with an initializer has to stay above the first
	   "goto cleanup", otherwise the jump would bypass an initialization. */
	
	/* results of the current generation (filled by save_CPU) and the best ever found;
	   zero-initialized so they are never printed as garbage */
	double Hres = 0.0, Lres = 0.0, Ores = 0.0, Rres = 0.0, Tres = 0.0;
	double Hbest = 0.0, Lbest = 0.0, Obest = 0.0, Rbest = 0.0, Tbest = 0.0;
	int iGen, jGen, iRNV, iCycle, reStart, rnvofs, sizeRNV, maxGenRNV;
	clock_t start, finish, delay1, delay2;
	char path[255];
	/* NULL until opened, so cleanup can tell whether they need closing */
	FILE *infofile = NULL, *timingfile = NULL;
	int exitCode = 1; /* set to 0 only when the run reaches its normal end */

	/* Dimensions */
	dim3 ThreadsPerBlock(maxPop, 1, 1);
	dim3 BlocksPerGrid(secGAx, secGAy, 1);
	dim3 ThreadsPerBlockRNV(128, 1, 1);
	dim3 BlocksPerGridRNV(512, 512, 1);
	int maxRNV = 33554432;//512*512*128
	int maxRNVpGen = 0;
	size_t size2Ddbl = Data.TotalSize * sizeof(double);
	size_t size2Dush = Data.TotalSize * sizeof(unsigned short);
	size_t sizeRNVdbl =  maxRNV * sizeof(double);
	size_t sizeRNVush =  maxRNV * sizeof(unsigned short);
	size_t size1Ddbl = maxGA * sizeof(double);
	size_t size1Dint = maxGA * sizeof(int);
	
	/* Creating CPU working variables (NULL so that cleanup may free them unconditionally) */
	double* zero_h = NULL;
	double* xo_h = NULL; double* xc_h = NULL; double* yc_h = NULL; double* sig0_h = NULL;
	double* T_h = NULL; double* R_h = NULL; double* H_h = NULL; double* O_h = NULL; double* Err_h = NULL;
	unsigned short* usRNV_h = NULL; double* dRNV_h = NULL;
	double* Omin_h = NULL; double* Oavg_h = NULL; double* Omax_h = NULL;

	/* Creating GPU working variables (NULL so that cleanup may free them unconditionally) */
	double* zero_d = NULL;
	double* xo_d = NULL; double* xc_d = NULL; double* yc_d = NULL; double* sig0_d = NULL;
	double* Texit_d = NULL; double* T_d = NULL; double* R_d = NULL; double* H_d = NULL; double* O_d = NULL;
	unsigned short* usRNV_d = NULL; double* dRNV_d = NULL;
	unsigned short* ixo_d = NULL; unsigned short* ixc_d = NULL; 
	unsigned short* iyc_d = NULL; unsigned short* isig0_d = NULL;
	unsigned short* jxo_d = NULL; unsigned short* jxc_d = NULL; 
	unsigned short* jyc_d = NULL; unsigned short* jsig0_d = NULL;
	double* Omin_d = NULL; double* Oavg_d = NULL; double* Omax_d = NULL;
	double* a_d = NULL; double* b_d = NULL; double* F_d = NULL; double* Err_d = NULL;
	int* Idx_d = NULL;
		
	/* Allocating CPU working variables in host memory */
	zero_h = (double *)malloc(size1Ddbl);	
	xo_h = (double *)malloc(size2Ddbl);
	xc_h = (double *)malloc(size2Ddbl);
	yc_h = (double *)malloc(size2Ddbl);
	sig0_h = (double *)malloc(size2Ddbl);
	T_h = (double *)malloc(size2Ddbl);
	R_h = (double *)malloc(size2Ddbl);
	H_h = (double *)malloc(size2Ddbl);
	O_h = (double *)malloc(size2Ddbl);
	Err_h = (double *)malloc(size2Ddbl);
	usRNV_h = (unsigned short *)malloc(sizeRNVush);
	dRNV_h = (double *)malloc(sizeRNVdbl);	
	Oavg_h = (double *)malloc(size1Ddbl);
	Omin_h = (double *)malloc(size1Ddbl);
	Omax_h = (double *)malloc(size1Ddbl);
	if( !zero_h || !xo_h || !xc_h || !yc_h || !sig0_h || !T_h || !R_h || !H_h ||
		!O_h || !Err_h || !usRNV_h || !dRNV_h || !Oavg_h || !Omin_h || !Omax_h )
	{
		SetTextColor( 12, 0 );
		printf( "Cannot allocate host memory." );
		goto cleanup;
	}
	
	/* Allocate GPU working variables in device memory */
	cudaMalloc((void**)&zero_d, size1Ddbl);
	cudaMalloc((void**)&xo_d, size2Ddbl);
	cudaMalloc((void**)&xc_d, size2Ddbl);
	cudaMalloc((void**)&yc_d, size2Ddbl);
	cudaMalloc((void**)&sig0_d, size2Ddbl);
	cudaMalloc((void**)&Texit_d, size2Ddbl);
	cudaMalloc((void**)&T_d, size2Ddbl);
	cudaMalloc((void**)&R_d, size2Ddbl);
	cudaMalloc((void**)&H_d, size2Ddbl);
	cudaMalloc((void**)&O_d, size2Ddbl);
	cudaMalloc((void**)&Err_d, size2Ddbl);
	cudaMalloc((void**)&usRNV_d, sizeRNVush);
	cudaMalloc((void**)&dRNV_d, sizeRNVdbl);
	cudaMalloc((void**)&ixo_d, size2Dush);
	cudaMalloc((void**)&ixc_d, size2Dush);
	cudaMalloc((void**)&iyc_d, size2Dush);
	cudaMalloc((void**)&isig0_d, size2Dush);
	cudaMalloc((void**)&jxo_d, size2Dush);
	cudaMalloc((void**)&jxc_d, size2Dush);
	cudaMalloc((void**)&jyc_d, size2Dush);
	cudaMalloc((void**)&jsig0_d, size2Dush);
	cudaMalloc((void**)&Omin_d, size1Ddbl);
	cudaMalloc((void**)&Oavg_d, size1Ddbl);
	cudaMalloc((void**)&Omax_d, size1Ddbl);
	cudaMalloc((void**)&a_d, size1Ddbl);
	cudaMalloc((void**)&b_d, size1Ddbl);
	cudaMalloc((void**)&F_d, size2Ddbl);
	cudaMalloc((void**)&Idx_d, size1Dint);
	/* a failure of any allocation above is still recorded as the last error */
	if( cuda_step_failed() ) goto cleanup;



	/* -------------------- INITIALIZATIONS -------------------- */

	/* Initializing CPU operations, reading random numbers */
	if( init_CPU() != 0 ) 
	{
		SetTextColor( 12, 0 );
		printf("Cannot initialize IO.");
		goto cleanup;
	}
	strcpy( path, WORK_PATH );
	strcat( path, "timing2.txt" );
	if( (infofile = fopen( path, "wt" )) == NULL )
	{
		SetTextColor( 12, 0 );
		printf( "Cannot create timing2 file." );
		goto cleanup;
	}
	fprintf(infofile,"     Hcr[m]      xo[m]       t[s] (the best ever found)(GAx=%d GAy=%d Pop=%d Alg=%d LC=%d)\n", secGAx, secGAy, maxPop, algorithm, typGA);

	strcpy( path, WORK_PATH );
	strcat( path, "timing.txt" );
	if( (timingfile = fopen( path, "wt" )) == NULL )
	{
		SetTextColor( 12, 0 );
		printf( "Cannot create timing file.\n" );
		goto cleanup;
	}
	fprintf(timingfile,"     Hcr[m]      xo[m]       t[s] (actually the best)(GAx=%d GAy=%d Pop=%d Alg=%d LC=%d)\n", secGAx, secGAy, maxPop, algorithm, typGA);

	if(( typGA != 5 ) && ( typGA != 9 ) && ( typGA != 10 ) && ( typGA != 13 ))
	{
		SetTextColor( 12, 0 );
		printf("typGA=%d - not recognized!", typGA);
		goto cleanup;
	}
	/* an unknown algorithm would leave maxRNVpGen unset (division by zero below) */
	if(( algorithm != -1 ) && ( algorithm != 0 ) && ( algorithm != 1 ))
	{
		SetTextColor( 12, 0 );
		printf("algorithm=%d - not recognized!", algorithm);
		goto cleanup;
	}

	/* zeroing working data */
	for( iGen=0; iGen<maxRNV; iGen++ )
	{ 
		usRNV_h[iGen] = (unsigned short) 0;
		dRNV_h[iGen] = 0.0;
	}
	for( iGen=0; iGen<maxGA; iGen++ ){ zero_h[iGen] = 0.0; }
	cudaMemcpy(zero_d, zero_h, size1Ddbl, cudaMemcpyHostToDevice);
	if( cuda_step_failed() ) goto cleanup;

	SetTextColor( 7, 0 );
	gotoxy( 0, yStart );
	printf("GAx = %d GAy = %d Pop = %d Alg = %d LC = %d MaxGen = %d", secGAx, secGAy, maxPop, algorithm, typGA, maxGen );


	/* -------------------- CALCULATIONS -------------------- */

	iGen = 0;
	iRNV = 0;
	iCycle = 1;
	reStart = 1;
	start = clock();

reRNV:

	delay1 = clock();

	/* Reading RNV from the file */
	iRNV++;
	SetTextColor( 11, 0 );
	gotoxy( 0, 2 + yStart );
	printf("RNV(%.3d) : ", iRNV);
	sizeRNV = read_RNV( iRNV, usRNV_h, maxRNV );
	if( sizeRNV <= 0 ) { SetTextColor( 12, 0 ); printf( "Error reading RNV!" ); goto cleanup; }
	sizeRNVush = sizeRNV * sizeof( unsigned short );
	sizeRNVdbl = sizeRNV * sizeof( double );

	/* the time spent reading the file is excluded from the reported computation time */
	delay2 = clock();
	start += (delay2 - delay1);

	/* random numbers consumed by one generation, depending on the algorithm */
	if( algorithm == -1 ) maxRNVpGen = 4 * Data.TotalSize; 
	if( algorithm == 0 ) maxRNVpGen = 78 * Data.TotalSize; 
	if( algorithm == 1 ) 
	{
		if( typGA == 5 ) maxRNVpGen = 82 * Data.TotalSize;
		if(( typGA == 9 ) || ( typGA == 10 )) maxRNVpGen = 86 * Data.TotalSize;
		if( typGA == 13 ) maxRNVpGen = 90 * Data.TotalSize;
	}
	/* number of generations the package just read can feed */
	maxGenRNV = (int)floor( (double)sizeRNV / (double)maxRNVpGen );
	printf("MRN(%.8d) : MGE(%.3d) : RNV(%.8d) : ", maxRNVpGen, maxGenRNV, sizeRNV );
	if( maxGenRNV < 1 ) goto cleanup;

	/* Recalculation of RNV to doubles */	
	printf("CNV : ");
	cudaMemcpy(usRNV_d, usRNV_h, sizeRNVush, cudaMemcpyHostToDevice);
	if( cuda_step_failed() ) goto cleanup;
	
	RNV_GPU<<<BlocksPerGridRNV, ThreadsPerBlockRNV>>>( usRNV_d, dRNV_d, sizeRNV );
	if( cuda_step_failed() ) goto cleanup;

	/* Iterations using read package of TRN */
	rnvofs = 0;//start point for RN vector
	jGen = 0;
	do
	{
		/* Making starting population (genotypes), uses 4 trn per individual */
		if((( algorithm > -1 ) && ( reStart == 1 )) || ( algorithm == -1 ))
		{
			SetTextColor( 11, 0 );
			gotoxy( 60, 2 + yStart );
			if( algorithm == -1 ) printf("MAK(%.3d)", iGen); else printf("MAK(%.3d)", iCycle);
			MakePopulation_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, Data, ixo_d, ixc_d, iyc_d, isig0_d );
			if( cuda_step_failed() ) goto cleanup;
			rnvofs += Data.TotalSize * 4;
			reStart = 0;
		}

		/* Decoding variables (phenotypes) */
		gotoxy( 0, 3 + yStart );
		SetTextColor( 10, 0 );
		printf("ITR(%.3d) : ", iGen); 
		SetTextColor( 14, 0 );
		printf("DEC : ");
		DecodePhenots_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( ixo_d, ixc_d, iyc_d, isig0_d, Data, xo_d, xc_d, yc_d, sig0_d );
		if( cuda_step_failed() ) goto cleanup;
	
		/* Finding limits of integration */
		printf("LIM : ");
		find_teta_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( xo_d, xc_d, yc_d, Data, Texit_d );
		if( cuda_step_failed() ) goto cleanup;
		
		/* Evaluating objectives */
		printf("OBJ : ");
		RTO_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( xo_d, xc_d, yc_d, sig0_d, Texit_d, Data, R_d, T_d, O_d, H_d, Err_d ); 
		if( cuda_step_failed() ) goto cleanup;
	
		/* Statistics */
		printf("STA : ");
		avg_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( O_d, Oavg_d, Data.TotalSize ); 
		if( cuda_step_failed() ) goto cleanup;
		min_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( O_d, Omin_d, Data.TotalSize ); 
		if( cuda_step_failed() ) goto cleanup;
		max_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( O_d, Omax_d, Idx_d, Data.TotalSize ); 
		if( cuda_step_failed() ) goto cleanup;

		/* Copy results from device memory to host memory */
		printf("COP : ");
		cudaMemcpy(xo_h, xo_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(xc_h, xc_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(yc_h, yc_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(sig0_h, sig0_d, size2Ddbl, cudaMemcpyDeviceToHost);
		if( cuda_step_failed() ) goto cleanup;
		cudaMemcpy(R_h, R_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(T_h, T_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(H_h, H_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(O_h, O_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(Err_h, Err_d, size2Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(Oavg_h, Oavg_d, size1Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(Omin_h, Omin_d, size1Ddbl, cudaMemcpyDeviceToHost);
		cudaMemcpy(Omax_h, Omax_d, size1Ddbl, cudaMemcpyDeviceToHost);
		if( cuda_step_failed() ) goto cleanup;

		/* Saving results */
		printf("SAV : ");
		save_CPU( iGen, R_h, T_h, O_h, H_h, Err_h, Oavg_h, Omin_h, Omax_h, xo_h, xc_h, yc_h, sig0_h, &Hres, &Lres, &Ores, &Rres, &Tres, maxGA, maxPop );
		//saveall_CPU( iGen, R_h, T_h, O_h, H_h, xo_h, xc_h, yc_h, sig0_h, maxGA, maxPop );
		if( Ores > Obest ) { Obest = Ores; Hbest = Hres; Lbest = Lres; Rbest = Rres; Tbest = Tres; }
		SetTextColor( 0, 15 );
		printf("OK");
		
		finish = clock();

		/* the best results ever found */
		SetTextColor( 13, 0 );
		gotoxy( 0, 7 + yStart );
		printf("xo = %6.3lf : Hcr = %5.3lf : R = %6.3lf : T = %6.3lf", Lbest, Hbest, Rbest, Tbest );
		fprintf( infofile, " %10.3lf %10.3lf %10.3lf\n", Hbest, Lbest, (double)(finish - start) / (double)CLOCKS_PER_SEC );
		/* the best results in actual population */
		SetTextColor( 12, 0 );
		gotoxy( 0, 7 + yStart + iCycle );
		printf("xo = %6.3lf : Hcr = %5.3lf : R = %6.3lf : T = %6.3lf", Lres, Hres, Rres, Tres );
		fprintf( timingfile, " %10.3lf %10.3lf %10.3lf\n", Hres, Lres, (double)(finish - start) / (double)CLOCKS_PER_SEC );

		/* time of computations */
		gotoxy( 0, 6 + yStart );
		SetTextColor( 11, 0 );
		printf("TIM : %.3lf sec.", (double)(finish - start) / (double)CLOCKS_PER_SEC );
		
		if( jGen == maxGenRNV ) break;
		if( iGen == maxGen ) break;

		if( algorithm == 1 )
		{
			/* Migration */
			gotoxy( 0, 4 + yStart );
			SetTextColor( 10, 0 );
			printf("PGA(%.3d) : ", iGen );
			SetTextColor( 14, 0 );
			printf("MIG : ");
			if( typGA == 5 )
			{
				printf("OL5 : ");		
				migrationL5_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, Data.MigP, O_d, ixo_d, ixc_d, iyc_d, isig0_d, Idx_d, Data.TotalSize ); 
			}
			if( typGA == 9 )
			{
				printf("OL9 : ");		
				migrationL9_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, Data.MigP, O_d, ixo_d, ixc_d, iyc_d, isig0_d, Idx_d, Data.TotalSize ); 
			}
			if( typGA == 10 )
			{
				printf("OC9 : ");		
				migrationC9_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, Data.MigP, O_d, ixo_d, ixc_d, iyc_d, isig0_d, Idx_d, Data.TotalSize ); 
			}
			if( typGA == 13 )
			{
				printf("X13 : ");		
				migrationLC13_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, Data.MigP, O_d, ixo_d, ixc_d, iyc_d, isig0_d, Idx_d, Data.TotalSize ); 
			}
			if( cuda_step_failed() ) goto cleanup;
			rnvofs += Data.TotalSize * 4;

			SetTextColor( 0, 15 );
			printf("OK");
		}
		else
		{
			gotoxy( 0, 4 + yStart );
			SetTextColor( 10, 0 );
			printf("PGA(---)" );
		}

		if( algorithm > -1 )
		{
			/* Scalling fitnesses */
			gotoxy( 0, 5 + yStart );
			SetTextColor( 10, 0 );
			printf("GEN(%.3d) : ", iGen );
			SetTextColor( 14, 0 );
			printf("FIT : ");
			ab_GPU<<<BlocksPerGrid, 1>>>( Omin_d, Oavg_d, Omax_d, a_d, b_d, maxGA ); 
			if( cuda_step_failed() ) goto cleanup;
			O2F_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( O_d, a_d, b_d, F_d, Data.TotalSize ); 
			if( cuda_step_failed() ) goto cleanup;

			/* Selection */
			printf("SEL : ");
			tournaments_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, F_d, ixo_d, ixc_d, iyc_d, isig0_d, jxo_d, jxc_d, jyc_d, jsig0_d, Data.TotalSize );
			if( cuda_step_failed() ) goto cleanup;
			rnvofs += Data.TotalSize * 4;

			/* Crossing-over */
			printf("SXX : ");	
			crossover_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, usRNV_d, rnvofs, Data.CrossP, jxo_d, jxc_d, jyc_d, jsig0_d, ixo_d, ixc_d, iyc_d, isig0_d, Data.TotalSize );
			if( cuda_step_failed() ) goto cleanup;
			rnvofs += Data.TotalSize * 6;

			/* Mutation */
			printf("MUT : ");	
			mutation_GPU<<<BlocksPerGrid, ThreadsPerBlock>>>( dRNV_d, rnvofs, Data.MutP, ixo_d, ixc_d, iyc_d, isig0_d, Data.TotalSize );
			if( cuda_step_failed() ) goto cleanup;
			rnvofs += Data.TotalSize * 64;			
			
			SetTextColor( 0, 15 );
			printf("OK");
		}//GA
		else
		{
			gotoxy( 0, 5 + yStart );
			SetTextColor( 10, 0 );
			printf("GEN(---)" );
		}
		//repeated cycles
		if( iGen == ( iCycle * Cycle ))
		{
			iCycle++;
			reStart = 1;
		}
		iGen++;
		jGen++;
	}while( jGen < maxGenRNV );

	/* package exhausted before maxGen generations: read the next one */
	if( iGen < maxGen ) goto reRNV;

	/* only the normal end of the run gets here without a jump */
	exitCode = 0;

cleanup:

	gotoxy( 0, 20 + yStart );
	SetTextDefault();

	/* closing output streams*/
	if( infofile ) fclose(infofile);
	if( timingfile ) fclose(timingfile);

	/* Free device memory (pointers never allocated are still NULL) */
	cudaFree(zero_d);
	cudaFree(xo_d);	cudaFree(xc_d);	cudaFree(yc_d);	cudaFree(sig0_d);
	cudaFree(Texit_d); cudaFree(R_d); cudaFree(T_d); cudaFree(H_d); 
	cudaFree(O_d); cudaFree(dRNV_d); cudaFree(usRNV_d);
	cudaFree(ixo_d); cudaFree(ixc_d); cudaFree(iyc_d); cudaFree(isig0_d);
	cudaFree(jxo_d); cudaFree(jxc_d); cudaFree(jyc_d); cudaFree(jsig0_d);
	cudaFree(Omin_d); cudaFree(Oavg_d); cudaFree(Omax_d); cudaFree(Err_d);
	cudaFree(a_d); cudaFree(b_d); cudaFree(F_d); cudaFree(Idx_d);

	/* Free host memory (pointers never allocated are still NULL) */
    free(zero_h);
	free(xo_h); free(xc_h); free(yc_h); free(sig0_h); free(T_h);
	free(R_h);  free(H_h); free(O_h); free(usRNV_h); free(dRNV_h);
	free(Omin_h); free(Oavg_h); free(Omax_h); free(Err_h); 
	
	printf("Results saved in %s\n", WORK_PATH );
	printf("Press <Enter>...");
	(void)getchar();
	return exitCode;
}
