cuda_mandelbrot/mandelbrot.cu

// Mandelbrot with CUDA
//
// Author: Axel Huebl (Serial Code by Matze)
// Date:   10th Jan 2012
//

#include <stdio.h>
#include <math.h>
#include <complex.h>
#include "cuda.h"

// simulation parameters
const int max_iterations = 255;
const int num_cols = 400;
const int num_rows = 300;

// cuda parameters
size_t blocksize = 256; // threads per block
const int maxRam = 250; // ION has approx. 256 MB global RAM


// Complex Numbers
struct cuComplex {
  float r;
  float i;
  __device__ cuComplex( float a, float b ) : r(a), i(b) {}
  __device__ float magnitude2( void ) {
    return r * r + i * i;
  }
  __device__ cuComplex operator*(const cuComplex& a) {
    return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
  }
  __device__ cuComplex operator*(const float& a) {
    return cuComplex(r*a, i*a);
  }
  __device__ cuComplex operator+(const cuComplex& a) {
    return cuComplex(r+a.r, i+a.i);
  }
  __device__ cuComplex operator+(const float& a) {
    return cuComplex(r+a, i);
  }
};

__device__ int iterate(cuComplex c ) {
        cuComplex z(0., 0.);
        int iterations = 0;
        for( int i=0; i<max_iterations; i++ ) {
                z = z*z + c;
                if( sqrtf(z.magnitude2() ) > 2.0f )
                        break;
                else
                        ++iterations;
        }
        return iterations;
}

__global__ void calcMandelbrot( int* color_d, const int num_rows, const int num_cols ) {
   
    const int globalX = ( blockIdx.x * blockDim.x ) + threadIdx.x;
    const int globalY = blockIdx.y;
    const int offset  = globalY * num_cols + globalX;

    // parameters
    const float c_rmin = -2.0;
    const float c_rmax = +1.0;
    const float c_imin = -1.0;
    const float c_imax = +1.0;

    const float dx = (c_rmax - c_rmin) / float(num_cols);
    const float dy = (c_imax - c_imin) / float(num_rows);

    cuComplex imaginary( 0., 1.);

    if( globalY < num_rows && globalX < num_cols ) {
      cuComplex c = ( imaginary*( c_imin+(float(globalY)*dy) ) ) + (c_rmin+(float(globalX)*dx));
      color_d[offset] = iterate(c);
    }

}

int main() {

	FILE *output = fopen("mandelbrot.ppm", "w+b");

        int *color_h, *color_d;
        const int nBytes = num_rows*num_cols*sizeof(int);

        const int globalMem = nBytes / 1024 / 1024; // in MiB
        printf( "Will use %d MiB of global Memory...\n", globalMem );

        if( globalMem > maxRam ) {
           printf( "Maximum RAM is %d ... exit now...\n", maxRam);
           return 1;
        }

        // allocate host memory
        color_h = (int*)malloc(nBytes);
        // allocate device memory
        cudaMalloc( (void**)&color_d, nBytes );

        // init host
        for( int i=0; i<num_cols*num_rows; i++ ) color_h[i] = 0;
        // copy to device
        cudaMemcpy(color_d, color_h, nBytes, cudaMemcpyHostToDevice);
        printf( "Copied Memory to Device...\n" );
        
        // call kernel
        // dimension and size of grid *in blocks* (2D)
        dim3 grid( ceil( double(num_cols)/double(blocksize) ), num_rows );
        printf( "Grid size in blocks: %d %d\n", grid.x, grid.y );
        // dimension and size of blocks *in threads* (3D)
        dim3 threads( blocksize );

        // asynchroner (!!) funktionsaufruf!
        calcMandelbrot<<<grid, threads>>>( color_d, num_rows, num_cols );
        printf( "%s\n", cudaGetErrorString( cudaGetLastError() ) );


        // copy to host
        cudaMemcpy(color_h, color_d, nBytes, cudaMemcpyDeviceToHost);
        printf( "Copied Memory back to Host...\n" );


	fprintf(output, "P3\n");
	fprintf(output, "%d %d\n%d\n\n", num_cols, num_rows, max_iterations);

	for (int x=0; x<num_cols; x++) {
		for (int y=0; y<num_rows; y++) {
			// float complex imaginary = 0+1.0i;
			// c = (c_rmin+(x*dx)) + ((c_imin+(y*dy))*imaginary);

			// color = iterate(c);
			fprintf(output, "%d\n", color_h[x*num_rows + y]);
			fprintf(output, "%d\n", color_h[x*num_rows + y]);
			fprintf(output, "%d\n\n", color_h[x*num_rows + y]);
		}
	}

	fclose(output);

        // free host memory
        free(color_h);
        // free devide memory
        cudaFree(color_d);

	return 0;
}