/* 
Massive Parallel Genetic Algorithm for ATI/AMD Graphics Processing Units
Copyright (c) 2009, Piotr E. Srokosz, University of Warmia and Mazury in Olsztyn
All rights reserved.
*/

/* 
Mersenne Twister Random Number Generator
Copyright (c) 2007, Advanced Micro Devices, Inc.
All rights reserved.

Bitonic sorter
Copyright (c) 2003, Stanford University
All rights reserved.
Copyright (c) 2007, Advanced Micro Devices, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of Advanced Micro Devices, Inc nor the names of its contributors
  may be used to endorse or promote products derived from this software
  without specific prior written permission.

* Neither the name of Stanford University nor the names of any contributors
  may be used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

*/
/*****************************************************************************

   The commonly used variant of Mersenne Twister, MT19937 is implemented.
   Algorithm and pseudocode: http://en.wikipedia.org/wiki/Mersenne_twister

   The GPU implementation follows the SFMT variant of the algorithm. The reference
   SFMT implementation can be found at: http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/index.html
 
 ****************************************************************************/

/***************************************************************************

  The parallel bitonic sorter sorts Length numbers in O(Length * lg^2(Length)) time.
  Length must be a power of two.

  A good explanation of the parallel sorting algorithm can be found at:
  http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

****************************************************************************/


/* Project headers */
#include <stdio.h>
#include <stdlib.h>
#include <sys/timeb.h>
#include <time.h>
#include <memory.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include "brookgenfiles/GAVARkernels.h"


/* Returns the product of two consecutive rand() draws.
 * The multiplication is done in unsigned arithmetic so that it wraps modulo
 * 2^N instead of overflowing a signed int (undefined behaviour when RAND_MAX
 * is large). */
static unsigned int
RandProduct_CPU( void )
{
	const unsigned int first = (unsigned int)rand();
	const unsigned int second = (unsigned int)rand();

	return first * second;
}

/* Kernel CPU (genetic algorithm): creating seed for RNG
 *
 * Fills the x, y, z and w components of every element of `seed` with the
 * product of two rand() draws. The element for indices (iga, jpop) lives at
 * position jpop * sizeGA + iga.
 *
 * seed    - output array; must hold at least sizeGA * sizePop elements
 * sizeGA  - extent of the iga index (stride between consecutive jpop rows)
 * sizePop - extent of the jpop index
 *
 * The generator is not seeded here; the caller is expected to call srand().
 */
void
CreateSeed_CPU(uint4* seed, const unsigned int sizeGA, const unsigned int sizePop)
{
	unsigned int iga, jpop, pos;
	
	for(iga = 0; iga < sizeGA; iga++)
	{
		for(jpop = 0; jpop < sizePop; jpop++)
		{
			pos = jpop * sizeGA + iga;
			seed[pos].x = RandProduct_CPU();
			seed[pos].y = RandProduct_CPU();
			seed[pos].z = RandProduct_CPU();
			seed[pos].w = RandProduct_CPU();
		}
	}
}

/* Kernel CPU (genetic algorithm): Readdressing best individuals
 *
 * All arrays are indexed as jpop * sizeGA + iga, and the x, y, z and w
 * components are processed independently of each other.
 *
 * Phase 1: every entry whose copy counter in nCvar is greater than 1 hands
 *          its surplus copies to entries of the same iga whose counter is 0.
 *          Each recipient receives the donor's xnvar/ynvar address and a
 *          counter of 1; the donor's counter is decremented.
 * Phase 2: entries whose counter is still 0 take their address from
 *          xrvar/yrvar, consumed from the last jpop row downwards, and get a
 *          counter of 1.
 *
 * nCvar        - copy counters (modified in place)
 * xnvar, ynvar - addresses (modified in place)
 * xrvar, yrvar - replacement addresses (read only); presumably ordered by
 *                the caller's sort step - TODO confirm against caller
 * sizeGA       - extent of the iga index
 * sizePop      - extent of the jpop index
 */
void
ReAddress_CPU(ushort4* nCvar, int4* xnvar, int4* ynvar, int4* xrvar, int4* yrvar, const unsigned int sizeGA, const unsigned int sizePop)
{
	unsigned int ga, row, scan, donor, slot;
	uint4 tail;

	/* Phase 1: spread surplus copies over empty slots */
	for(ga = 0; ga < sizeGA; ++ga)
	{
		for(row = 0; row < sizePop; ++row)
		{
			donor = row * sizeGA + ga;

			/* x component */
			for(scan = 0; ( scan < sizePop ) && ( nCvar[donor].x > 1 ); ++scan)
			{
				slot = scan * sizeGA + ga;
				if( nCvar[slot].x != (ushort)0 ) continue;
				nCvar[donor].x -= 1;
				nCvar[slot].x = 1;
				xnvar[slot].x = xnvar[donor].x;
				ynvar[slot].x = ynvar[donor].x;
			}

			/* y component */
			for(scan = 0; ( scan < sizePop ) && ( nCvar[donor].y > 1 ); ++scan)
			{
				slot = scan * sizeGA + ga;
				if( nCvar[slot].y != (ushort)0 ) continue;
				nCvar[donor].y -= 1;
				nCvar[slot].y = 1;
				xnvar[slot].y = xnvar[donor].y;
				ynvar[slot].y = ynvar[donor].y;
			}

			/* z component */
			for(scan = 0; ( scan < sizePop ) && ( nCvar[donor].z > 1 ); ++scan)
			{
				slot = scan * sizeGA + ga;
				if( nCvar[slot].z != (ushort)0 ) continue;
				nCvar[donor].z -= 1;
				nCvar[slot].z = 1;
				xnvar[slot].z = xnvar[donor].z;
				ynvar[slot].z = ynvar[donor].z;
			}

			/* w component */
			for(scan = 0; ( scan < sizePop ) && ( nCvar[donor].w > 1 ); ++scan)
			{
				slot = scan * sizeGA + ga;
				if( nCvar[slot].w != (ushort)0 ) continue;
				nCvar[donor].w -= 1;
				nCvar[slot].w = 1;
				xnvar[slot].w = xnvar[donor].w;
				ynvar[slot].w = ynvar[donor].w;
			}
		}
	}

	/* Phase 2: fill the slots that are still empty */
	for(ga = 0; ga < sizeGA; ++ga)
	{
		/* one "next replacement row" cursor per component, starting past the end */
		tail.x = sizePop;
		tail.y = sizePop;
		tail.z = sizePop;
		tail.w = sizePop;

		for(row = 0; row < sizePop; ++row)
		{
			slot = row * sizeGA + ga;

			if( nCvar[slot].x == 0 )
			{
				--tail.x;
				donor = tail.x * sizeGA + ga;
				xnvar[slot].x = xrvar[donor].x;
				ynvar[slot].x = yrvar[donor].x;
				nCvar[slot].x = 1;
			}
			if( nCvar[slot].y == 0 )
			{
				--tail.y;
				donor = tail.y * sizeGA + ga;
				xnvar[slot].y = xrvar[donor].y;
				ynvar[slot].y = yrvar[donor].y;
				nCvar[slot].y = 1;
			}
			if( nCvar[slot].z == 0 )
			{
				--tail.z;
				donor = tail.z * sizeGA + ga;
				xnvar[slot].z = xrvar[donor].z;
				ynvar[slot].z = yrvar[donor].z;
				nCvar[slot].z = 1;
			}
			if( nCvar[slot].w == 0 )
			{
				--tail.w;
				donor = tail.w * sizeGA + ga;
				xnvar[slot].w = xrvar[donor].w;
				ynvar[slot].w = yrvar[donor].w;
				nCvar[slot].w = 1;
			}
		}
	}
}

/* Creates (or truncates) the file `path` and writes `header` to it.
 * Prints `createMsg` and returns 0 when the file cannot be opened; prints
 * `closeMsg` when fclose() reports an error. Returns 1 once the file has
 * been opened, regardless of the close result. */
static int
WriteHeader_CPU( const char *path, const char *header, const char *createMsg, const char *closeMsg )
{
	FILE *outfile = fopen( path, "wt" );

	if( outfile == NULL )
	{
		printf( "%s", createMsg );
		return 0;
	}

	fputs( header, outfile );

	if( fclose( outfile ) )
	{
		printf( "%s", closeMsg );
	}
	return 1;
}

/* Kernel CPU (IO operations): saves headers in files
 *
 * Creates results.txt, best.txt and populations.txt in the current
 * directory, each containing only its column-header line, then prints the
 * program banner. If one of the files cannot be created, a message is
 * printed and the function returns immediately: the remaining files are not
 * touched and the banner is not printed.
 */
void
init_CPU( void )
{
	if( !WriteHeader_CPU( "results.txt",
	                      "Gener.     iGA  L       Obj   minObj   avgObj   maxObj        R        T        H       xo       xc       yc     sig0\n",
	                      "Cannot create results file.\n",
	                      "The results file was not closed.\n" ) )
	{
		return;
	}

	if( !WriteHeader_CPU( "best.txt",
	                      "Gener.     minObj   avgObj   maxObj        R        T        H       xo       xc       yc     sig0\n",
	                      "Cannot create best results file.\n",
	                      "The best results file was not closed.\n" ) )
	{
		return;
	}

	if( !WriteHeader_CPU( "populations.txt",
	                      "Gener.     iGA  iPop  L       Obj        R        T        H       xo       xc       yc     sig0\n",
	                      "Cannot create populations file.\n",
	                      "The populations file was not closed.\n" ) )
	{
		return;
	}

	printf("Slope Stability + GA (GPGPU) v.1.01 beta, P.Srokosz, 2009.\n");
}

/* Kernel CPU (genetic algorithm): saving best individuals and statistics
 *
 * Appends to results.txt, for every iga and every component (x, y, z, w),
 * one line describing the individual with the largest objective O that is
 * strictly greater than zero, together with Omin/Oavg/Omax of that iga.
 * When no objective of an iga/component is positive, the first individual
 * (jpop = 0) of that iga is reported, so the index used is always valid.
 *
 * Appends to best.txt one line for the overall best individual (largest
 * positive objective over all iga and components) and returns its H and xo
 * through Hcr and Lcr. When no positive objective exists at all, zeros are
 * written and returned.
 *
 * itr                   - generation number printed on every line
 * R, T, O, H,
 * xo, xc, yc, sig0      - per-individual arrays, indexed jpop * sizeGA + iga
 * Oavg, Omin, Omax      - per-iga statistics, indexed by iga
 * Hcr, Lcr              - outputs: H and xo of the overall best individual
 * sizeGA, sizePop       - extents of the iga and jpop indices
 *
 * If either file cannot be opened a message is printed and the function
 * returns without writing anything and without modifying Hcr/Lcr.
 */
void
save_CPU( uint itr, float4* R, float4* T, float4* O, float4* H, 
		  float4* Oavg, float4* Omin, float4* Omax, 
		  float4* xo, float4* xc, float4* yc, float4* sig0, float *Hcr, float *Lcr,
		  const unsigned int sizeGA, const unsigned int sizePop )
{
	FILE *outfile, *bestfile;
	unsigned int iga, jpop, pos;
	float4 maxObj = float4(0.0f,0.0f,0.0f,0.0f);
	uint4 pos4;
	/* Defaults keep the best-file line and the outputs defined when no
	   objective is positive. */
	float Best = 0.0f, Avg = 0.0f, Min = 0.0f, Max = 0.0f;
	float Rb = 0.0f, Tb = 0.0f, Hb = 0.0f;
	float xob = 0.0f, xcb = 0.0f, ycb = 0.0f, sig0b = 0.0f;
	unsigned int iBest;

	/* Nothing to report for an empty system; avoid indexing empty arrays. */
	if( sizeGA == 0 || sizePop == 0 )
	{
		*Hcr = 0.0f;
		*Lcr = 0.0f;
		return;
	}
	
	if( (outfile = fopen( "results.txt", "at" )) == NULL )
	{
		printf( "Cannot open results file.\n" );
		return;
	}

	if( (bestfile = fopen( "best.txt", "at" )) == NULL )
	{
		printf( "Cannot open best results file.\n" );
		fclose( outfile );
		return;
	}

	for(iga = 0; iga < sizeGA; iga++)
	{
		maxObj = float4(0.0f,0.0f,0.0f,0.0f);
		/* Start from the first individual of this iga (jpop = 0) so that
		   pos4 is never indeterminate nor left over from a previous iga. */
		pos4.x = iga;
		pos4.y = iga;
		pos4.z = iga;
		pos4.w = iga;

		/* locate the largest positive objective per component */
		for(jpop = 0; jpop < sizePop; jpop++)
		{
			pos = jpop * sizeGA + iga;
			if( O[pos].x > maxObj.x )
			{
				maxObj.x = O[pos].x;
				pos4.x = pos;
			}
			if( O[pos].y > maxObj.y )
			{
				maxObj.y = O[pos].y;
				pos4.y = pos;
			}
			if( O[pos].z > maxObj.z )
			{
				maxObj.z = O[pos].z;
				pos4.z = pos;
			}
			if( O[pos].w > maxObj.w )
			{
				maxObj.w = O[pos].w;
				pos4.w = pos;
			}
		}

		/* update the overall best with this iga's component winners */
		if( Best < maxObj.x ) 
		{ 
			Best = maxObj.x; 
			iBest = pos4.x; 
			Avg = Oavg[iga].x;
			Min = Omin[iga].x;
			Max = Omax[iga].x;
			Rb = R[iBest].x;
			Tb = T[iBest].x;
			Hb = H[iBest].x;
			xob = xo[iBest].x;
			xcb = xc[iBest].x;
			ycb = yc[iBest].x;
			sig0b = sig0[iBest].x;
		}
		if( Best < maxObj.y ) 
		{ 
			Best = maxObj.y; 
			iBest = pos4.y;
			Avg = Oavg[iga].y;
			Min = Omin[iga].y;
			Max = Omax[iga].y;
			Rb = R[iBest].y;
			Tb = T[iBest].y;
			Hb = H[iBest].y;
			xob = xo[iBest].y;
			xcb = xc[iBest].y;
			ycb = yc[iBest].y;
			sig0b = sig0[iBest].y;
		}
		if( Best < maxObj.z ) 
		{ 
			Best = maxObj.z; 
			iBest = pos4.z; 
			Avg = Oavg[iga].z;
			Min = Omin[iga].z;
			Max = Omax[iga].z;
			Rb = R[iBest].z;
			Tb = T[iBest].z;
			Hb = H[iBest].z;
			xob = xo[iBest].z;
			xcb = xc[iBest].z;
			ycb = yc[iBest].z;
			sig0b = sig0[iBest].z;
		}
		if( Best < maxObj.w ) 
		{ 
			Best = maxObj.w; 
			iBest = pos4.w; 
			Avg = Oavg[iga].w;
			Min = Omin[iga].w;
			Max = Omax[iga].w;
			Rb = R[iBest].w;
			Tb = T[iBest].w;
			Hb = H[iBest].w;
			xob = xo[iBest].w;
			xcb = xc[iBest].w;
			ycb = yc[iBest].w;
			sig0b = sig0[iBest].w;
		}
		fprintf(outfile,"Gen: %.3d i: %.3d x: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, iga, O[pos4.x].x, Omin[iga].x, Oavg[iga].x, Omax[iga].x, R[pos4.x].x, T[pos4.x].x, H[pos4.x].x, xo[pos4.x].x, xc[pos4.x].x, yc[pos4.x].x, sig0[pos4.x].x );
		fprintf(outfile,"Gen: %.3d i: %.3d y: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, iga, O[pos4.y].y, Omin[iga].y, Oavg[iga].y, Omax[iga].y, R[pos4.y].y, T[pos4.y].y, H[pos4.y].y, xo[pos4.y].y, xc[pos4.y].y, yc[pos4.y].y, sig0[pos4.y].y );
		fprintf(outfile,"Gen: %.3d i: %.3d z: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, iga, O[pos4.z].z, Omin[iga].z, Oavg[iga].z, Omax[iga].z, R[pos4.z].z, T[pos4.z].z, H[pos4.z].z, xo[pos4.z].z, xc[pos4.z].z, yc[pos4.z].z, sig0[pos4.z].z );
		fprintf(outfile,"Gen: %.3d i: %.3d w: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n\n", itr, iga, O[pos4.w].w, Omin[iga].w, Oavg[iga].w, Omax[iga].w, R[pos4.w].w, T[pos4.w].w, H[pos4.w].w, xo[pos4.w].w, xc[pos4.w].w, yc[pos4.w].w, sig0[pos4.w].w );
	}
	fprintf(bestfile,"Gen: %.3d %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, Min, Avg, Max, Rb, Tb, Hb, xob, xcb, ycb, sig0b );

	*Hcr = Hb;
	*Lcr = xob;

	/* both handles are known to be open here */
	if ( fclose( outfile ) )
	{
		printf( "The results file was not closed.\n" );
	}

	if ( fclose( bestfile ) )
	{
		printf( "The best results file was not closed.\n" );
	}
}

/* Kernel CPU (genetic algorithm): saving all populations and statistics
 *
 * Appends to populations.txt four lines (components x, y, z, w) for every
 * (iga, jpop) pair, iterating iga in the outer loop and jpop in the inner
 * one. Each line holds O, R, T, H, xo, xc, yc and sig0 of that individual.
 *
 * itr              - generation number printed on every line
 * R, T, O, H,
 * xo, xc, yc, sig0 - per-individual arrays, indexed jpop * sizeGA + iga
 * sizeGA, sizePop  - extents of the iga and jpop indices
 *
 * If the file cannot be opened a message is printed and nothing is written.
 */
void
saveall_CPU( uint itr, float4* R, float4* T, float4* O, float4* H, 
		     float4* xo, float4* xc, float4* yc, float4* sig0,
		     const unsigned int sizeGA, const unsigned int sizePop )
{
	FILE *popfile = fopen( "populations.txt", "at" );

	if( popfile == NULL )
	{
		printf( "Cannot open populations file.\n" );
		return;
	}

	for( unsigned int ga = 0; ga < sizeGA; ++ga )
	{
		for( unsigned int ind = 0; ind < sizePop; ++ind )
		{
			const unsigned int idx = ind * sizeGA + ga;

			/* short aliases for the record of this individual */
			const float4& o  = O[idx];
			const float4& r  = R[idx];
			const float4& t  = T[idx];
			const float4& h  = H[idx];
			const float4& px = xo[idx];
			const float4& cx = xc[idx];
			const float4& cy = yc[idx];
			const float4& s0 = sig0[idx];

			fprintf(popfile,"Gen: %.3d i: %.3d j: %.3d x: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, ga, ind, o.x, r.x, t.x, h.x, px.x, cx.x, cy.x, s0.x );
			fprintf(popfile,"Gen: %.3d i: %.3d j: %.3d y: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, ga, ind, o.y, r.y, t.y, h.y, px.y, cx.y, cy.y, s0.y );
			fprintf(popfile,"Gen: %.3d i: %.3d j: %.3d z: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n", itr, ga, ind, o.z, r.z, t.z, h.z, px.z, cx.z, cy.z, s0.z );
			fprintf(popfile,"Gen: %.3d i: %.3d j: %.3d w: %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f\n\n", itr, ga, ind, o.w, r.w, t.w, h.w, px.w, cx.w, cy.w, s0.w );
		}
	}

	/* the handle is known to be open at this point */
	if( fclose( popfile ) )
	{
		printf( "The populations file was not closed.\n" );
	}
}

int
main(int argc, char* argv[])
{
	/* ----- CONTROL ----- */
	/* algorithm = -1 : random searching
	   algorithm = 0 : GA
	   algorithm = 1 : PGA
	*/
	int algorithm = 1;

	/* ----- DATA ----- */
	
	unsigned int maxPop = 256; //HAS TO BE POWER OF 2
	unsigned int maxGA = 64; // multiple of secGA
	         int secGA = 8; //section of GA in toroidal structure
			 int typGA = 7; //type of migration structure: 0 - random, 7 - L7
	unsigned int maxGen = 1000;
	unsigned int intGA = 125; //renewing interval
	
	float Alfa = 0.0f * (float)M_PI / 180.0f;
	float Beta = 80.0f * (float)M_PI  / 180.0f; 
	float Fi = 20.0f * (float)M_PI / 180.0f;
	float coh = 15.0f;
	float gam = 24.0f;
	float wT = 0.87f;
	float wR = 0.87f;
	float wK = 0.0f;
	float wH = 0.99f;
	float OBJstart = 1000.0f;
//
	float maxxo = -1.0f, minxo = -6.11f;
	float maxxc = 6.11f, minxc = 1.0f;
	float maxyc = -1.0f, minyc = -6.11f;
	float maxsig0 = -2.0f, minsig0 = -12.23f;
/*
	float maxxo = -2.24f, minxo = -2.24f;
	float maxxc = 4.0f, minxc = 3.0f;
	float maxyc = -4.0f, minyc = -4.5f;
	float maxsig0 = -8.0f, minsig0 = -9.0f;
*/
	ushort xoBits = (ushort)9;
	ushort xcBits = (ushort)9;
	ushort ycBits = (ushort)9;
	ushort sig0Bits = (ushort)10;
	float CrossP = 1.0f;
	float MutP = 0.005f;
	float MigP = 1.0f/(float)maxPop; //individuals per population

	/* ----- VARIABLES ----- */
	float tanFi = tanf( Fi ), tanBeta = tanf( Beta ), tanAlfa = tanf( Alfa ), Tetag = (float)M_PI / 2.0f - Beta;	
	float Hres, Lres;
	float4 tanFi4, coh4, gam4, tanBeta4, tanAlfa4, Tetag4, wT4, wR4, wK4, wH4, maxPop4, maxGA4, Zero4, CrossP4, MutP4;
	float4 minsig04, maxsig04, minxo4, maxxo4, minxc4, maxxc4, minyc4, maxyc4, OBJstart4, MigP4;
	ushort4 sig0Bits4, xoBits4, xcBits4, ycBits4;
	int2 result;
	int childproc, lgArraySize, flip, stage;
	unsigned int iGen, iRen;
	clock_t start, total, finish;
	FILE *infofile, *timingfile;

	/* extend definitions of constants */
	tanFi4.x = tanFi; tanFi4.y = tanFi; tanFi4.z = tanFi; tanFi4.w = tanFi;
	coh4.x = coh; coh4.y = coh; coh4.z = coh; coh4.w = coh;
	gam4.x = gam; gam4.y = gam; gam4.z = gam; gam4.w = gam;
	tanAlfa4.x = tanAlfa; tanAlfa4.y = tanAlfa; tanAlfa4.z = tanAlfa; tanAlfa4.w = tanAlfa;
	tanBeta4.x = tanBeta; tanBeta4.y = tanBeta; tanBeta4.z = tanBeta; tanBeta4.w = tanBeta;
	Tetag4.x = Tetag; Tetag4.y = Tetag; Tetag4.z = Tetag; Tetag4.w = Tetag;
	wT4.x = wT; wT4.y = wT; wT4.z = wT; wT4.w = wT;
	wR4.x = wR; wR4.y = wR; wR4.z = wR; wR4.w = wR;
	wK4.x = wK; wK4.y = wK; wK4.z = wK; wK4.w = wK;
	wH4.x = wH; wH4.y = wH; wH4.z = wH; wH4.w = wH;
	minsig04.x = minsig0; minsig04.y = minsig0; minsig04.z = minsig0; minsig04.w = minsig0; 
	maxsig04.x = maxsig0; maxsig04.y = maxsig0; maxsig04.z = maxsig0; maxsig04.w = maxsig0;	
	minxo4.x = minxo; minxo4.y = minxo; minxo4.z = minxo; minxo4.w = minxo;
	maxxo4.x = maxxo; maxxo4.y = maxxo; maxxo4.z = maxxo; maxxo4.w = maxxo;    
	minxc4.x = minxc; minxc4.y = minxc; minxc4.z = minxc; minxc4.w = minxc;
	maxxc4.x = maxxc; maxxc4.y = maxxc; maxxc4.z = maxxc; maxxc4.w = maxxc;
	minyc4.x = minyc; minyc4.y = minyc; minyc4.z = minyc; minyc4.w = minyc;
	maxyc4.x = maxyc; maxyc4.y = maxyc; maxyc4.z = maxyc; maxyc4.w = maxyc;
	sig0Bits4.x = sig0Bits; sig0Bits4.y = sig0Bits; sig0Bits4.z = sig0Bits; sig0Bits4.w = sig0Bits;
	xoBits4.x = xoBits; xoBits4.y = xoBits; xoBits4.z = xoBits; xoBits4.w = xoBits;
	xcBits4.x = xcBits; xcBits4.y = xcBits; xcBits4.z = xcBits; xcBits4.w = xcBits;
	ycBits4.x = ycBits; ycBits4.y = ycBits; ycBits4.z = ycBits; ycBits4.w = ycBits;
	maxPop4.x = (float)maxPop; maxPop4.y = (float)maxPop; maxPop4.z = (float)maxPop; maxPop4.w = (float)maxPop;
	maxGA4.x = (float)maxGA; maxGA4.y = (float)maxGA; maxGA4.z = (float)maxGA; maxGA4.w = (float)maxGA;
	OBJstart4.x = OBJstart; OBJstart4.y = OBJstart; OBJstart4.z = OBJstart; OBJstart4.w = OBJstart;
	Zero4.x = 0.0f; Zero4.y = 0.0f; Zero4.z = 0.0f; Zero4.w = 0.0f;
	CrossP4.x = CrossP; CrossP4.y = CrossP; CrossP4.z = CrossP; CrossP4.w = CrossP;
	MutP4.x = MutP; MutP4.y = MutP; MutP4.z = MutP; MutP4.w = MutP;
	MigP4.x = MigP; MigP4.y = MigP; MigP4.z = MigP; MigP4.w = MigP;

	/* Initializing dimensions and rank */
    unsigned int streamSizeGA[] = {maxGA}, streamSizeSys[] = {maxGA, maxPop};
    unsigned int rankGA = 1, rankSys = 2;
	
	/* creating CPU working variables */
    float4* xo4 = new float4[maxGA * maxPop];
	memset( xo4, 0, maxGA * maxPop * sizeof(float4));
    float4* xc4 = new float4[maxGA * maxPop];
	memset( xc4, 0, maxGA * maxPop * sizeof(float4));
    float4* yc4 = new float4[maxGA * maxPop];
	memset( yc4, 0, maxGA * maxPop * sizeof(float4));
    float4* sig04 = new float4[maxGA * maxPop];
	memset( sig04, 0, maxGA * maxPop * sizeof(float4));
    float4* R4 = new float4[maxGA * maxPop];
	memset( R4, 0, maxGA * maxPop * sizeof(float4));
    float4* T4 = new float4[maxGA * maxPop];
	memset( T4, 0, maxGA * maxPop * sizeof(float4));
    float4* H4 = new float4[maxGA * maxPop];
	memset( H4, 0, maxGA * maxPop * sizeof(float4));
    float4* O4 = new float4[maxGA * maxPop];
	memset( O4, 0, maxGA * maxPop * sizeof(float4));
    float4* Oavg4 = new float4[maxGA];
	memset( Oavg4, 0, maxGA * sizeof(float4));
    float4* Omin4 = new float4[maxGA];
	memset( Omin4, 0, maxGA * sizeof(float4));
    float4* Omax4 = new float4[maxGA];
	memset( Omax4, 0, maxGA * sizeof(float4));
    uint4* seed4 = new uint4[maxGA * maxPop];
	memset( seed4, 0, maxGA * maxPop * sizeof(uint4));
    int4* X4 = new int4[maxGA * maxPop];
	memset( X4, 0, maxGA * maxPop * sizeof(int4));
    int4* Y4 = new int4[maxGA * maxPop];
	memset( Y4, 0, maxGA * maxPop * sizeof(int4));
    ushort4* nC4 = new ushort4[maxGA * maxPop];
	memset( nC4, 0, maxGA * maxPop * sizeof(ushort4));
    int4* Xr4 = new int4[maxGA * maxPop];
	memset( Xr4, 0, maxGA * maxPop * sizeof(int4));
    int4* Yr4 = new int4[maxGA * maxPop];
	memset( Yr4, 0, maxGA * maxPop * sizeof(int4));


	/* creating GPU working streams */
	//float4 2D
	brook::Stream<float4> xoStream(rankSys, streamSizeSys);
	brook::Stream<float4> xcStream(rankSys, streamSizeSys);
	brook::Stream<float4> ycStream(rankSys, streamSizeSys);
	brook::Stream<float4> TeStream(rankSys, streamSizeSys);
	brook::Stream<float4> sig0Stream(rankSys, streamSizeSys);
	brook::Stream<float4> RStream(rankSys, streamSizeSys);
	brook::Stream<float4> TStream(rankSys, streamSizeSys);
	brook::Stream<float4> HStream(rankSys, streamSizeSys);
	brook::Stream<float4> OStream(rankSys, streamSizeSys);
	brook::Stream<float4> newOStream(rankSys, streamSizeSys);
	brook::Stream<float4> OC1Stream(rankSys, streamSizeSys);
	brook::Stream<float4> OC2Stream(rankSys, streamSizeSys);
	brook::Stream<float4> RC1Stream(rankSys, streamSizeSys);
	brook::Stream<float4> RC2Stream(rankSys, streamSizeSys);
	brook::Stream<float4> TC1Stream(rankSys, streamSizeSys);
	brook::Stream<float4> TC2Stream(rankSys, streamSizeSys);
	brook::Stream<float4> HC1Stream(rankSys, streamSizeSys);
	brook::Stream<float4> HC2Stream(rankSys, streamSizeSys);
	brook::Stream<float4> FStream(rankSys, streamSizeSys);
	brook::Stream<float4> RemStream(rankSys, streamSizeSys);
	brook::Stream<float4> SortedStream(rankSys, streamSizeSys);
	//float4 1D
	brook::Stream<float4> OsumStream(rankGA, streamSizeGA);
	brook::Stream<float4> OavgStream(rankGA, streamSizeGA);
	brook::Stream<float4> OmaxStream(rankGA, streamSizeGA);
	brook::Stream<float4> OminStream(rankGA, streamSizeGA);
	brook::Stream<float4> FsumStream(rankGA, streamSizeGA);
	brook::Stream<float4> FavgStream(rankGA, streamSizeGA);
	brook::Stream<float4> FmaxStream(rankGA, streamSizeGA);
	brook::Stream<float4> aStream(rankGA, streamSizeGA);
	brook::Stream<float4> bStream(rankGA, streamSizeGA);
	//int4 2D
	brook::Stream<int4> XcStream(rankSys, streamSizeSys);
	brook::Stream<int4> YcStream(rankSys, streamSizeSys);
	brook::Stream<int4> XrStream(rankSys, streamSizeSys);
	brook::Stream<int4> YrStream(rankSys, streamSizeSys);
	brook::Stream<int4> XsStream(rankSys, streamSizeSys);
	brook::Stream<int4> YsStream(rankSys, streamSizeSys);
	//ushort4 2D
	brook::Stream<ushort4> ixoStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> ixcStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> iycStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> isig0Stream(rankSys, streamSizeSys);
	brook::Stream<ushort4> jxoStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> jxcStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> jycStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> jsig0Stream(rankSys, streamSizeSys);
	brook::Stream<ushort4> kxoStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> kxcStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> kycStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> ksig0Stream(rankSys, streamSizeSys);
	brook::Stream<ushort4> mxoStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> mxcStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> mycStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> msig0Stream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c1xoStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c1xcStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c1ycStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c1sig0Stream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c2xoStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c2xcStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c2ycStream(rankSys, streamSizeSys);
	brook::Stream<ushort4> c2sig0Stream(rankSys, streamSizeSys);
	brook::Stream<ushort4> nCopiesStream(rankSys, streamSizeSys);
	//ushort4 1D
	brook::Stream<ushort4> bestxoStream(rankGA, streamSizeGA);
	brook::Stream<ushort4> bestxcStream(rankGA, streamSizeGA);
	brook::Stream<ushort4> bestycStream(rankGA, streamSizeGA);
	brook::Stream<ushort4> bestsig0Stream(rankGA, streamSizeGA);
	//uint4 2D
	brook::Stream<uint4> seedStream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand1Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand2Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand3Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand4Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand5Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand6Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand7Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand8Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand9Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand10Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand11Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand12Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand13Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand14Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand15Stream(rankSys, streamSizeSys);
	brook::Stream<uint4> irand16Stream(rankSys, streamSizeSys);

	/* ----- CALCULATIONS ----- */

	/* Initializing CPU operations */
	init_CPU( );
	if( (infofile = fopen( "info.txt", "wt" )) == NULL )
	{
		printf( "Cannot create info file.\n" );
		goto cleanup;
	}

	if( (timingfile = fopen( "timing.txt", "wt" )) == NULL )
	{
		printf( "Cannot create timing file.\n" );
		goto cleanup;
	}
	fprintf(timingfile,"     Hcr[m]      xo[m]       t[s]\n");

	iGen = 1;
	iRen = 1;
	printf("STR : (%.3d) : ", iGen);
	fprintf(infofile,"STR : (%.3d) : ", iGen);

	/* Initializing parameters for bitonic sorter */
	lgArraySize = 0;
	stage = 0;

	/* Calculating number of stages for bitonic sorter */
	for (stage = maxPop >> 1; stage; lgArraySize++)
    {
        stage >>= 1;
    }


	/* Initializing RNG, creating starting seeds */
	srand( (unsigned)time( NULL ) );
	total = clock();
	start = total;

restart:

	/* Initializing GPU operations */

	printf("Clearing GPU streams.\n");
	//float4 2D
	clear_f4_GPU( xoStream, xcStream, ycStream, sig0Stream,
		          TeStream, RStream, TStream, HStream );
	clear_f4_GPU( OStream, newOStream, OC1Stream, OC2Stream,
		          RC1Stream, RC2Stream, TC1Stream, TC2Stream );
	clear_f4_GPU( HC1Stream, HC2Stream, FStream, RemStream,
		          SortedStream, RC2Stream, TC1Stream, TC2Stream );

	//float4 1D
	clear_f4_GPU( OsumStream, OavgStream, OminStream, OmaxStream,
		          FsumStream, FavgStream, FmaxStream, aStream );
	clear_f4_GPU( OsumStream, OavgStream, OminStream, OmaxStream,
		          FsumStream, FavgStream, FmaxStream, bStream );
	//int4 2D
	clear_i4_GPU( XcStream, YcStream, XrStream, YrStream, XsStream, YsStream );

	//uint4 2D
	clear_ui4_GPU( irand1Stream, irand2Stream, irand3Stream, irand4Stream,
		           irand5Stream, irand6Stream, irand7Stream, irand8Stream );
	clear_ui4_GPU( irand9Stream, irand10Stream, irand11Stream, irand12Stream,
		           irand13Stream, irand14Stream, irand15Stream, irand16Stream );
	//ushort4 2D
	clear_us4_GPU( ixoStream, ixcStream, iycStream, isig0Stream,
		           jxoStream, jxcStream, jycStream, jsig0Stream );
	clear_us4_GPU( kxoStream, kxcStream, kycStream, ksig0Stream,
		           mxoStream, mxcStream, mycStream, msig0Stream );
	clear_us4_GPU( c1xoStream, c1xcStream, c1ycStream, c1sig0Stream,
		           c2xoStream, c2xcStream, c2ycStream, c2sig0Stream );
	clear_us4a_GPU( c2xoStream, c2xcStream, c2ycStream, nCopiesStream );

	//ushort4 1D
	clear_us4a_GPU( bestxoStream, bestxcStream, bestycStream, bestsig0Stream );

	CreateSeed_CPU( seed4, maxGA, maxPop );
    
	/* Reading seed into seed stream */
    seedStream.read( seed4 );
    seedStream.finish();
	
	/* Generating random numbers */
    generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
	copy_GPU( irand8Stream, seedStream );

	/* Making starting population (genotypes) */
	fprintf(infofile, "MAK : ");
	MakePopulation_GPU( irand1Stream, irand2Stream, irand3Stream, irand4Stream, xoBits4, xcBits4, ycBits4, sig0Bits4,
				        ixoStream, ixcStream, iycStream, isig0Stream );

	/* Decoding variables (phenotypes) */
	fprintf(infofile,"DEC : ");
	decode_GPU( ixoStream, ixcStream, iycStream, isig0Stream, 
		        maxxo4, minxo4, xoBits4, maxxc4, minxc4, xcBits4,
			    maxyc4, minyc4, ycBits4, maxsig04, minsig04, sig0Bits4,
				xoStream, xcStream, ycStream, sig0Stream );
	
	/* Finding limits of integration */
	printf("LIM : ");
	fprintf(infofile,"LIM : ");
	find_teta_GPU( xoStream, xcStream, ycStream, tanAlfa4, Tetag4, tanFi4, TeStream );

	/* Evaluating objectives */
	printf("OBJ : ");
	fprintf(infofile,"OBJ : ");
	RTO_GPU( xoStream, xcStream, ycStream, sig0Stream, TeStream, OBJstart4, tanAlfa4, tanBeta4, tanFi4, coh4, gam4, wR4, wT4, wK4, wH4, RStream, TStream, OStream, HStream );

	/* Random searching */
	if( algorithm == -1 ) goto showinfo;

neweval:

	/* PGA - full GA */
	if(( algorithm == 1 ) && ( iGen > 1u ))
	{
		printf("MIG : ");
		fprintf(infofile,"MIG : ");
		
		/* Generating random numbers */
		generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
		copy_GPU( irand8Stream, seedStream );
		
		/* migration of best individuals across the system */
		if( typGA == 0 ) migration_GPU( maxGA4, MigP4, bestxoStream, bestxcStream, bestycStream, bestsig0Stream,
									OmaxStream, ixoStream, ixcStream, iycStream, isig0Stream, OStream,
									irand1Stream, irand2Stream, irand3Stream, jxoStream, jxcStream, jycStream, jsig0Stream, newOStream ); 
		
		/* migration of best individuals in Linear-7 system */
		if( typGA == 7 )
		{
			/* Generating random numbers */
			generateNumber_GPU( seedStream, irand9Stream, irand10Stream, irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream, irand16Stream );
			copy_GPU( irand16Stream, seedStream );

			/* Generating random matrix with assumed probability */
			MT15bit_GPU( MigP4, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream,
				         irand6Stream, irand7Stream, irand8Stream, irand9Stream, irand10Stream,
					     irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream,
						 nCopiesStream );
			migration_L7_GPU( secGA, maxGA, nCopiesStream, bestxoStream, bestxcStream, bestycStream, bestsig0Stream,
							  OmaxStream, ixoStream, ixcStream, iycStream, isig0Stream, OStream,
							  jxoStream, jxcStream, jycStream, jsig0Stream, newOStream ); 
		}

		/* restoring original variables */
		copy4_GPU( jxoStream, jxcStream, jycStream, jsig0Stream, ixoStream, ixcStream, iycStream, isig0Stream );
		copyf_GPU( newOStream, OStream );
	}
	
	/* Objective statistics */
	fprintf(infofile,"STA : ");
	fill_GPU( Zero4, OsumStream );
	sum_GPU( OStream, OsumStream );
	avg_GPU( OsumStream, maxPop4, OavgStream );
	min_max_best_GPU( maxPop, OStream, ixoStream, ixcStream, iycStream, isig0Stream, OminStream, OmaxStream, bestxoStream, bestxcStream, bestycStream, bestsig0Stream ); 
	
	/* Scaling fitnesses */
	printf("FIT : ");
	fprintf(infofile,"FIT : ");
	ab_GPU( OminStream, OavgStream, OmaxStream, aStream, bStream );
	O2F_GPU( OStream, aStream, bStream, FStream );

	/* Fitness statistics */
	fprintf(infofile,"STA : ");
	fill_GPU( Zero4, FsumStream );
	sum_GPU( FStream, FsumStream );	
	avg_GPU( FsumStream, maxPop4, FavgStream );
	max_GPU( maxPop, FStream, FmaxStream );

	/* Selection */
	printf("SEL : ");
	fprintf(infofile,"SEL : ");
	preselect_GPU( FStream, FmaxStream, nCopiesStream, XcStream, YcStream, RemStream, XrStream, YrStream ); 
	nCopiesStream.write( nC4 );
	XcStream.write( X4 );
	YcStream.write( Y4 );
	
	/* Sorting - bitonic sorter by Stanford University */
	flip = 0;	
	/* log(Length) stages */
    for (stage = 1; stage <= lgArraySize; ++stage)
    {
		int step = 0;
        /* Width of each sorted segment to be sorted in parallel (2, 4, 8, ...) */
        float segWidth = (float)pow(2.0f, stage);
        for (step = 1; step <= stage; ++step)
        {
			/* offset = (stageWidth/2, stageWidth/4, ... , 2, 1) */
            float offset = (float)pow(2.0f, stage - step);
            /* two buffers required since non-sequential gather is performed */
            /* from scratch buffer each step. */
            /* flip source and target streams each iteration */
            if (!flip)
				bitonic(RemStream, XrStream, YrStream, SortedStream, XsStream, YsStream, segWidth, offset, offset * 2.0f);
            else
				bitonic(SortedStream, XsStream, YsStream, RemStream, XrStream, YrStream, segWidth, offset, offset * 2.0f);
            flip ^= 0x01; // XOR flip w/ 0b1 which flips the flip variable between 0 and 1
		}
	}
    /* Write data back from streams - old form of procedures */
    streamWrite((flip) ? SortedStream : RemStream, R4);
    streamWrite((flip) ? XsStream : XrStream, Xr4);
    streamWrite((flip) ? YsStream : YrStream, Yr4);
	
	/* ReAddress individuals - CPU operation */
	printf("ADR : ");	
	fprintf(infofile,"ADR : ");	
	ReAddress_CPU(nC4, X4, Y4, Xr4, Yr4, maxGA, maxPop);
	XcStream.read(X4);
	YcStream.read(Y4);
	XcStream.finish();
	YcStream.finish();

	/* Copying Individuals as Parents */
	parents_GPU( ixoStream, ixcStream, iycStream, isig0Stream, XcStream, YcStream, jxoStream, jxcStream, jycStream, jsig0Stream );

	/* Generating random numbers */
    generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
	copy_GPU( irand8Stream, seedStream );

	/* Selecting Parents */
	select_GPU( maxPop4, jxoStream, jxcStream, jycStream, jsig0Stream, irand1Stream, irand2Stream, ixoStream, ixcStream, iycStream, isig0Stream, kxoStream, kxcStream, kycStream, ksig0Stream );
				
	/* Generating random 0/1 with assumed probability */
	MT01_GPU( CrossP4, irand3Stream, irand8Stream );

	/* Cross-over matrix */
	crossmat_GPU( irand4Stream, irand5Stream, irand6Stream, irand7Stream, xoBits4, xcBits4, ycBits4, sig0Bits4, mxoStream, mxcStream, mycStream, msig0Stream );
	
	/* Crossing-over */
	printf("SXX\n");	
	fprintf(infofile,"SXX\n");	
	/* first stage */
	cross_GPU( ixoStream, ixcStream, kxoStream, kxcStream, xoBits4, xcBits4, mxoStream, mxcStream, irand8Stream, 
		   c1xoStream, c1xcStream, c2xoStream, c2xcStream );

	/* second stage */
	cross_GPU( iycStream, isig0Stream, kycStream, ksig0Stream, ycBits4, sig0Bits4, mycStream, msig0Stream, irand8Stream, 
		   c1ycStream, c1sig0Stream, c2ycStream, c2sig0Stream );
		
	for( childproc=1; childproc<3; childproc++ )
	{
		printf("              CH%d : ", childproc);
		fprintf(infofile,"              CH%d : ", childproc);
		if( childproc == 1 )		
			copy4_GPU( c1xoStream, c1xcStream, c1ycStream, c1sig0Stream, jxoStream, jxcStream, jycStream, jsig0Stream );
		else
			copy4_GPU( c2xoStream, c2xcStream, c2ycStream, c2sig0Stream, jxoStream, jxcStream, jycStream, jsig0Stream );
		
		printf("MUT : ");	
		fprintf(infofile,"MUT : ");	
		/* First variable */
		/* Generating random numbers */
		generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
		copy_GPU( irand8Stream, seedStream );
	    generateNumber_GPU( seedStream, irand9Stream, irand10Stream, irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream, irand16Stream );
		copy_GPU( irand16Stream, seedStream );

		/* Generating random matrices with assumed probability */
		MT15bit_GPU( MutP4, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream,
		                irand6Stream, irand7Stream, irand8Stream, irand9Stream, irand10Stream,
		                irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream,
						mxoStream );

		/* Second variable */
		/* Generating random numbers */
		generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
		copy_GPU( irand8Stream, seedStream );
		generateNumber_GPU( seedStream, irand9Stream, irand10Stream, irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream, irand16Stream );
		copy_GPU( irand16Stream, seedStream );

		/* Generating random matrices with assumed probability */
		MT15bit_GPU( MutP4, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream,
		                irand6Stream, irand7Stream, irand8Stream, irand9Stream, irand10Stream,
		                irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream,
						mxcStream );

		/* Third variable */
		/* Generating random numbers */
		generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
		copy_GPU( irand8Stream, seedStream );
		generateNumber_GPU( seedStream, irand9Stream, irand10Stream, irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream, irand16Stream );
		copy_GPU( irand16Stream, seedStream );

		/* Generating random matrices with assumed probability */
		MT15bit_GPU( MutP4, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream,
		                irand6Stream, irand7Stream, irand8Stream, irand9Stream, irand10Stream,
		                irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream,
						mycStream );

		/* Fourth variable */
		/* Generating random numbers */
		generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
		copy_GPU( irand8Stream, seedStream );
		generateNumber_GPU( seedStream, irand9Stream, irand10Stream, irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream, irand16Stream );
		copy_GPU( irand16Stream, seedStream );

		/* Generating random matrices with assumed probability */
		MT15bit_GPU( MutP4, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream,
		                irand6Stream, irand7Stream, irand8Stream, irand9Stream, irand10Stream,
		                irand11Stream, irand12Stream, irand13Stream, irand14Stream, irand15Stream,
						msig0Stream );

		/* Mutation */
		mutation_GPU( jxoStream, jxcStream, jycStream, jsig0Stream, mxoStream, mxcStream, mycStream, msig0Stream, ixoStream, ixcStream, iycStream, isig0Stream );
		
		/* Decoding variables (phenotypes) */
		fprintf(infofile,"DEC : ");
		decode_GPU( ixoStream, ixcStream, iycStream, isig0Stream, 
			        maxxo4, minxo4, xoBits4, maxxc4, minxc4, xcBits4,
				    maxyc4, minyc4, ycBits4, maxsig04, minsig04, sig0Bits4,
					xoStream, xcStream, ycStream, sig0Stream );
	
		/* Finding limits of integration */
		printf("LIM : ");
		fprintf(infofile,"LIM : ");
		find_teta_GPU( xoStream, xcStream, ycStream, tanAlfa4, Tetag4, tanFi4, TeStream );

		/* Generating random numbers */
		generateNumber_GPU( seedStream, irand1Stream, irand2Stream, irand3Stream, irand4Stream, irand5Stream, irand6Stream, irand7Stream, irand8Stream );
		copy_GPU( irand8Stream, seedStream );

		/* Evaluating objectives */
		printf("OBJ : ");
		fprintf(infofile,"OBJ : ");
		if( childproc == 1 )
		{
			RTO_GPU( xoStream, xcStream, ycStream, sig0Stream, TeStream, OBJstart4, tanAlfa4, tanBeta4, tanFi4, coh4, gam4, wR4, wT4, wK4, wH4, RStream, TStream, OStream, HStream );
			mix_GPU( maxPop4, ixoStream, ixcStream, iycStream, isig0Stream, RStream, TStream, OStream, HStream, irand1Stream, c1xoStream, c1xcStream, c1ycStream, c1sig0Stream, RC1Stream, TC1Stream, OC1Stream, HC1Stream );
		}
		else
		{
			RTO_GPU( xoStream, xcStream, ycStream, sig0Stream, TeStream, OBJstart4, tanAlfa4, tanBeta4, tanFi4, coh4, gam4, wR4, wT4, wK4, wH4, RStream, TStream, OStream, HStream );
			mix_GPU( maxPop4, ixoStream, ixcStream, iycStream, isig0Stream, RStream, TStream, OStream, HStream, irand2Stream, c2xoStream, c2xcStream, c2ycStream, c2sig0Stream, RC2Stream, TC2Stream, OC2Stream, HC2Stream );
		}
		printf("OK\n");
		fprintf(infofile,"OK\n");
	}//children
	
	printf("              TRN : ");
	fprintf(infofile,"              TRN : ");
	/* Tournaments between children */
	tournament_GPU( OC1Stream, OC2Stream, RC1Stream, RC2Stream, TC1Stream, TC2Stream, HC1Stream, HC2Stream,
		            c1xoStream, c1xcStream, c1ycStream, c1sig0Stream,
		            c2xoStream, c2xcStream, c2ycStream, c2sig0Stream, 
					ixoStream, ixcStream, iycStream, isig0Stream, OStream, RStream, TStream, HStream );

	/* Decoding variables (phenotypes) */
	fprintf(infofile,"DEC : ");
	decode_GPU( ixoStream, ixcStream, iycStream, isig0Stream, 
		        maxxo4, minxo4, xoBits4, maxxc4, minxc4, xcBits4,
			    maxyc4, minyc4, ycBits4, maxsig04, minsig04, sig0Bits4,
				xoStream, xcStream, ycStream, sig0Stream );
	
showinfo:

	finish = clock();

	/* saving all populations - for debugging only */
	//saveall_CPU( iGen, R4, T4, O4, H4, xo4, xc4, yc4, sig04, maxGA, maxPop );
	
	/* Objective statistics */
	fprintf(infofile,"STA : ");
	fill_GPU( Zero4, OsumStream );
	sum_GPU( OStream, OsumStream );
	avg_GPU( OsumStream, maxPop4, OavgStream );
	OavgStream.write( Oavg4 );
	min_max_best_GPU( maxPop, OStream, ixoStream, ixcStream, iycStream, isig0Stream, OminStream, OmaxStream, bestxoStream, bestxcStream, bestycStream, bestsig0Stream ); 
	OminStream.write( Omin4 );
	OmaxStream.write( Omax4 );

	/* Copying results into CPU variables */
	xoStream.write( xo4 );
	xcStream.write( xc4 );
	ycStream.write( yc4 );
	sig0Stream.write( sig04 );
	RStream.write( R4 );
	TStream.write( T4 );
	OStream.write( O4 );
	HStream.write( H4 );

	printf("SAV : ");
	fprintf(infofile,"SAV : ");
	/* saving results & statistics (CPU) */	
	save_CPU( iGen, R4, T4, O4, H4, Oavg4, Omin4, Omax4, xo4, xc4, yc4, sig04, &Hres, &Lres, maxGA, maxPop );

	printf("END\n");
	fprintf(infofile,"END\n");
	printf("              TIM : %.3lf sec.\n\n", (double)(finish - start) / (double)CLOCKS_PER_SEC );
	fprintf(infofile,"              TIM : %.3lf sec.\n", (double)(finish - start) / (double)CLOCKS_PER_SEC );
	fprintf(infofile,"              TIM : %.3lf sec.\n\n", (double)(finish - total) / (double)CLOCKS_PER_SEC );
	printf("Critical height = %.3f m : Range of failure = %.3f m\n\n", Hres, fabsf(Lres));
	fprintf(infofile,"Critical height = %.3f m : Range of failure = %.3f m\n\n", Hres, fabsf(Lres));
	fprintf(timingfile," %10.3f %10.3f %10.3lf\n", Hres, fabsf(Lres), (double)(finish - total) / (double)CLOCKS_PER_SEC );

	if( iGen < maxGen )
	{
		iGen++;
		iRen++;
		start = clock();
		if( algorithm < 0 ) 
		{
			printf("MAK : (%.3d) : ", iGen);
			fprintf(infofile,"MAK : (%.3d) : ", iGen);
			goto restart;
		}
		else 
		{
			if( iRen > intGA )
			{
				printf("MAK : (%.3d) : ", iGen);
				fprintf(infofile,"MAK : (%.3d) : ", iGen);
				iRen = 1u;
				goto restart;
			}
			else
			{
				printf("NEW : (%.3d) : ", iGen);
				fprintf(infofile,"NEW : (%.3d) : ", iGen);
				goto neweval;
			}
		}
	}

	printf("Total time: %.3lf sec.\n\n", (double)(finish - total) / (double)CLOCKS_PER_SEC );
	fprintf(infofile,"Total time: %.3lf sec.\n\n", (double)(finish - total) / (double)CLOCKS_PER_SEC );
	printf("Thank you.\n");
	fprintf(infofile,"Thank you.\n");
	
cleanup://all ends here

	if( infofile )
	{
		if ( fclose( infofile ) )
		{
			printf( "The info file was not closed.\n" );
		}
	}

	if( timingfile )
	{
		if ( fclose( timingfile ) )
		{
			printf( "The timing file was not closed.\n" );
		}
	}

	delete[] nC4;
	delete[] Xr4;
	delete[] Yr4;
	delete[] X4;
	delete[] Y4;
	delete[] Omax4;
	delete[] Omin4;
	delete[] Oavg4;
	delete[] seed4;
	delete[] O4;
	delete[] H4;
    delete[] R4;
    delete[] T4;
    delete[] sig04;
    delete[] yc4;
    delete[] xc4;
    delete[] xo4;

	char c=getchar();

	return 0L;
}
