gpuBench

PURPOSE

GPUBENCH MATLAB GPU Benchmark

SYNOPSIS

function [outGPU,outHost] = gpuBench()

DESCRIPTION

GPUBENCH  MATLAB GPU Benchmark
   GPUBENCH times different MATLAB GPU tasks and compares the execution
   speed with the speed of several other GPUs.  The tasks are:

    Backslash   Matrix left-division.    Floating point, regular memory access.
    MTimes      Matrix multiplication.   Floating point, regular memory access.
    FFT         Fast Fourier Transform.  Floating point, irregular memory access.

   Each task is run for a range of array sizes and the results are tabulated
   in an HTML report.  GPUBENCH can take several minutes to complete - please
   be patient! Note that if your GPU is also driving your monitor then
   the display may become unresponsive during testing.
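   The three tasks above boil down to the following MATLAB operations.
   This is an illustrative sketch only, not the benchmark itself: the size
   N and the 'single' precision are arbitrary choices here, whereas
   GPUBENCH sweeps a range of sizes in both single and double precision
   (Parallel Computing Toolbox and a supported GPU are required):

```matlab
% Illustrative versions of the three timed tasks on the GPU.
% N and the precision are arbitrary choices for this sketch.
N = 1024;
A = gpuArray( rand( N, N, 'single' ) );
B = gpuArray( rand( N, N, 'single' ) );
b = gpuArray( rand( N, 1, 'single' ) );
v = gpuArray( rand( N, 1, 'single' ) );

x = A \ b;         % Backslash: matrix left-division
C = A * B;         % MTimes: matrix multiplication
F = fft( v );      % FFT: fast Fourier transform
wait( gpuDevice ); % GPU calls are asynchronous; wait before reading a timer
```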

   GPUBENCH runs each of the tasks and shows a report indicating how the
   current GPU compares to other systems.

   T = GPUBENCH returns a data structure containing all of the results and
   does not generate the report.
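   For example, to collect the raw results and produce the report
   separately (the two-output form and the report call mirror the source
   code below; gpuBenchReport is the companion function listed under
   "See also"):

```matlab
% Collect GPU and host results without opening the report...
[gpuData, hostData] = gpuBench();
% ...then build and display the HTML report when convenient.
web( gpuBenchReport( gpuData, hostData ) );
```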

   Fluctuations of up to ten percent in the measured times of repeated
   runs on a single machine are not uncommon.  Your own mileage may vary.

   This benchmark is intended to compare the performance of different GPUs
   on one particular version of MATLAB.  It does not offer direct
   comparisons between different versions of MATLAB.

   See also: BENCH, gpuBenchReport

CROSS-REFERENCE INFORMATION

This function calls:
This function is called by:

SUBFUNCTIONS

SOURCE CODE

0001 function [outGPU,outHost] = gpuBench()
0002 %GPUBENCH  MATLAB GPU Benchmark
0003 %   GPUBENCH times different MATLAB GPU tasks and compares the execution
0004 %   speed with the speed of several other GPUs.  The tasks are:
0005 %
0006 %    Backslash   Matrix left-division.    Floating point, regular memory access.
0007 %    MTimes      Matrix multiplication.   Floating point, regular memory access.
0008 %    FFT         Fast Fourier Transform.  Floating point, irregular memory access.
0009 %
0010 %   Each task is run for a range of array sizes and the results are tabulated
0011 %   in an HTML report.  GPUBENCH can take several minutes to complete - please
0012 %   be patient! Note that if your GPU is also driving your monitor then
0013 %   the display may become unresponsive during testing.
0014 %
0015 %   GPUBENCH runs each of the tasks and shows a report indicating how the
0016 %   current GPU compares to other systems.
0017 %
0018 %   T = GPUBENCH returns a data structure containing all of the results and
0019 %   does not generate the report.
0020 %
0021 %   Fluctuations of up to ten percent in the measured times of repeated
0022 %   runs on a single machine are not uncommon.  Your own mileage may vary.
0023 %
0024 %   This benchmark is intended to compare the performance of different GPUs
0025 %   on one particular version of MATLAB.  It does not offer direct
0026 %   comparisons between different versions of MATLAB.
0027 %
0028 %   See also: BENCH, gpuBenchReport
0029 
0030 % Unused tasks:
0031 %    Mandelbrot  Calculate a Mandelbrot Set.  Floating point, regular memory access.
0032 
0033 %   Author: Ben Tordoff
0034 %   Copyright 2011-2012 The MathWorks, Inc.
0035 
0036 % Check for the right MATLAB version and availability of PCT
0037 gpubench.checkMATLABVersion();
0038 gpubench.checkPCT();
0039 
0040 % Check for a GPU. We give the option of running without a GPU so that
0041 % users can evaluate what benefits a GPU might give.
0042 hasGPU = parallel.gpu.GPUDevice.isAvailable();
0043 if ~hasGPU
0044     title = 'Continue without a GPU?';
0045     question = ['The GPU could not be used. ' ...
0046         'Do you wish to continue and collect results for your CPU?'];
0047     buttons = {'Collect CPU results', 'Stop'};
0048     answer = questdlg(question, title, buttons{:}, buttons{end});
0049     if ~strcmp(answer,buttons{1})
0050         warning( 'GPUBench:NoGPU', 'No GPU was available for GPUBench to use.' );
0051         return;
0052     end
0053 end
0054 
0055 % Initialize the data object
0056 release = regexp( version, 'R\d*[ab]', 'match' );
0057 gpuData = gpubench.PerformanceData( ...
0058     release{1}, ...
0059     gpubench.cpuinfo(), ...
0060     gpubench.gpuinfo(), ...
0061     now() );
0062 hostData = gpubench.PerformanceData( ...
0063     release{1}, ...
0064     gpubench.cpuinfo(), ...
0065     struct(), ...
0066     now() );
0067 hostData.IsHostData = true;
0068 
0069 % Do we need to measure the host stuff?
0070 doHost = (nargout~=1);
0071 numTasks = 6*(hasGPU+doHost);
0072 reps = 3;
0073 progressTitle = 'Running GPUBench...';
0074 gpubench.multiWaitbar( progressTitle, 0 );
0075 
0076 if hasGPU
0077     gpuData = runBackslash( gpuData, reps, 'single', 'GPU', progressTitle, numTasks );
0078     gpuData = runBackslash( gpuData, reps, 'double', 'GPU', progressTitle, numTasks );
0079 
0080     gpuData = runMTimes( gpuData, reps, 'single', 'GPU', progressTitle, numTasks );
0081     gpuData = runMTimes( gpuData, reps, 'double', 'GPU', progressTitle, numTasks );
0082 
0083     gpuData = runFFT( gpuData, reps, 'single', 'GPU', progressTitle, numTasks );
0084     gpuData = runFFT( gpuData, reps, 'double', 'GPU', progressTitle, numTasks );
0085 
0086     % gpuData = runMandelbrot( gpuData, reps, 'double', 'GPU', progressTitle, numTasks );
0087 end
0088 
0089 if doHost
0090     hostData = runBackslash( hostData, reps, 'single', 'Host', progressTitle, numTasks );
0091     hostData = runBackslash( hostData, reps, 'double', 'Host', progressTitle, numTasks );
0092     
0093     hostData = runMTimes( hostData, reps, 'single', 'Host', progressTitle, numTasks );
0094     hostData = runMTimes( hostData, reps, 'double', 'Host', progressTitle, numTasks );
0095     
0096     hostData = runFFT( hostData, reps, 'single', 'Host', progressTitle, numTasks );
0097     hostData = runFFT( hostData, reps, 'double', 'Host', progressTitle, numTasks );
0098     
0099     % hostData = runMandelbrot( hostData, reps, 'double', 'Host', progressTitle, numTasks );
0100 end
0101 
0102 gpubench.multiWaitbar( progressTitle, 'Close' );
0103 
0104 if nargout
0105     % User requested raw data
0106     outGPU = gpuData;
0107     outHost = hostData;
0108 else
0109     % Produce report
0110     reportData = {};
0111     if hasGPU
0112         reportData{end+1} = gpuData;
0113     end
0114     if doHost
0115         reportData{end+1} = hostData;
0116     end
0117     web( gpuBenchReport( reportData{:} ) );
0118 end
0119 
0120 
0121 %-------------------------------------------------------------------------%
0122 function data = runFFT( data, reps, type, device, mainProgressTitle, numTasks )
0123 % Work out the maximum size we should run
0124 safetyFactor = 6; % Based on trial and error. Requiring 6x the input seems safe.
0125 sizes = getTestSizes( type, safetyFactor, device );
0126 times = inf( size( sizes ) );
0127 worstTime = 0;
0128 
0129 progressTitle = sprintf( 'FFT (%s, %s)', device, type );
0130 progressTotal = sum(sizes);
0131 gpubench.multiWaitbar( progressTitle, 0 );
0132 
0133 for ii=1:numel(sizes)
0134     % Check for getting close to time-out
0135     if tooCloseToTimeout( worstTime, device )
0136         %fprintf( 'Skipping FFT of size %u to prevent timeout.\n', sizes(ii) );
0137         times(ii) = nan;
0138         continue;
0139     end        
0140     N = sizes(ii);
0141     try
0142         A = complex( rand( N, 1, type ), rand( N, 1, type ) );
0143         if strcmpi( device, 'GPU' )
0144             A = gpuArray(A);
0145         end
0146         
0147         for rr=1:reps
0148             t = tic();
0149             B = fft(A); %#ok<NASGU>
0150             elapsedTime = gtoc(t);
0151             times(ii) = min( times(ii), elapsedTime );
0152             worstTime = max( worstTime, elapsedTime );
0153             clear B;
0154             % Update both progress bars
0155             inc = sizes(ii)/(reps*progressTotal);
0156             gpubench.multiWaitbar( progressTitle, 'Increment', inc );
0157             gpubench.multiWaitbar( mainProgressTitle, 'Increment', inc/numTasks );
0158         end
0159     catch err %#ok<NASGU>
0160         %fprintf( 'discarded FFT of size %u.\n', N );
0161         times(ii) = nan;
0162     end
0163 end
0164 gpubench.multiWaitbar( progressTitle, 'Close' );
0165 
0166 % Clear any dud results
0167 sizes(isnan( times )) = [];
0168 times(isnan( times )) = [];
0169 
0170 data = addResult( data, 'FFT', type, sizes, 5*sizes.*log2(sizes), times );
0171 
0172 
0173 %-------------------------------------------------------------------------%
0174 function data = runMTimes( data, reps, type, device, mainProgressTitle, numTasks )
0175 safetyFactor = 3.5; % Space for two inputs plus one output and a bit to spare
0176 sizes = getTestSizes( type, safetyFactor, device );
0177 
0178 times = inf( size( sizes ) );
0179 worstTime = 0;
0180 
0181 progressTitle = sprintf( 'MTimes (%s, %s)', device, type );
0182 progressTotal = sum(sizes);
0183 gpubench.multiWaitbar( progressTitle, 0 );
0184 
0185 N = round( sqrt( sizes ) );
0186 for ii=1:numel(sizes)
0187     % Check for getting close to time-out
0188     if tooCloseToTimeout( worstTime, device )
0189         %fprintf( 'Skipping MTimes of %ux%u to prevent timeout.\n', N(ii), N(ii) );
0190         times(ii) = nan;
0191         continue;
0192     end        
0193     
0194     try
0195         A = rand( N(ii), N(ii), type );
0196         B = rand( N(ii), N(ii), type );
0197         if strcmpi( device, 'GPU' )
0198             A = gpuArray(A);
0199             B = gpuArray(B);
0200         end
0201         for rr=1:reps
0202             t = tic();
0203             C = A*B; %#ok<NASGU>
0204             elapsedTime = gtoc(t);
0205             times(ii) = min( times(ii), elapsedTime );
0206             worstTime = max( worstTime, elapsedTime );
0207             clear C;
0208             % Update both progress bars
0209             inc = sizes(ii)/(reps*progressTotal);
0210             gpubench.multiWaitbar( progressTitle, 'Increment', inc );
0211             gpubench.multiWaitbar( mainProgressTitle, 'Increment', inc/numTasks );
0212         end
0213     catch err %#ok<NASGU>
0214         %fprintf( 'discarded MTimes of %ux%u.\n', N(ii), N(ii) );
0215         times(ii) = nan;
0216     end
0217 end
0218 gpubench.multiWaitbar( progressTitle, 'Close' );
0219 
0220 % Clear any dud results
0221 N(isnan( times )) = [];
0222 times(isnan( times )) = [];
0223 
0224 data = addResult( data, 'MTimes', type, N.*N, N.*N.*(2.*N-1), times );
0225 
0226 
0227 
0228 %-------------------------------------------------------------------------%
0229 function data = runBackslash( data, reps, type, device, mainProgressTitle, numTasks )
0230 safetyFactor = 1.5; % One full-sized matrix plus two vectors, so 1.5 is plenty
0231 sizes = getTestSizes( type, safetyFactor, device );
0232 
0233 % Limit the sizes to 1e8 for now to prevent problems
0234 sizes(sizes>1e8) = [];
0235 
0236 times = inf( size( sizes ) );
0237 worstTime = 0;
0238 
0239 progressTitle = sprintf( 'Backslash (%s, %s)', device, type );
0240 progressTotal = sum(sizes);
0241 gpubench.multiWaitbar( progressTitle, 0 );
0242 
0243 N = round( sqrt( sizes ) );
0244 for ii=1:numel(sizes)
0245     % Check for getting close to time-out
0246     if tooCloseToTimeout( worstTime, device )
0247         %fprintf( 'Skipping Backslash of %ux%u to prevent timeout.\n', N(ii), N(ii) );
0248         times(ii) = nan;
0249         continue;
0250     end        
0251     try
0252         A = 100*eye( N(ii), N(ii), type ) + rand( N(ii), N(ii), type );
0253         b = rand( N(ii), 1, type );
0254         if strcmpi( device, 'GPU' )
0255             A = gpuArray(A);
0256             b = gpuArray(b);
0257         end
0258         for rr=1:reps
0259             t = tic();
0260             C = A\b; %#ok<NASGU>
0261             elapsedTime = gtoc(t);
0262             times(ii) = min( times(ii), elapsedTime );
0263             worstTime = max( worstTime, elapsedTime );
0264             clear C;
0265             % Update both progress bars
0266             inc = sizes(ii)/(reps*progressTotal);
0267             gpubench.multiWaitbar( progressTitle, 'Increment', inc );
0268             gpubench.multiWaitbar( mainProgressTitle, 'Increment', inc/numTasks );
0269         end
0270 
0271     catch err %#ok<NASGU>
0272         %fprintf( 'discarded Backslash of %ux%u.\n', N(ii), N(ii) );
0273         times(ii) = nan;
0274     end
0275 end
0276 gpubench.multiWaitbar( progressTitle, 'Close' );
0277 
0278 % Clear any dud results
0279 N(isnan( times )) = [];
0280 times(isnan( times )) = [];
0281 
0282 data = addResult( data, 'Backslash', type, N.*N, round(2/3*N.^3 + 3/2*N.^2), times );
0283 
0284 
0285 %-------------------------------------------------------------------------%
0286 function data = runMandelbrot( data, reps, type, device, mainProgressTitle, numTasks ) %#ok<DEFNU>
0287 % This task will only run on the GPU
0288 safetyFactor = 3;
0289 sizes = getTestSizes( type, safetyFactor, device );
0290 
0291 times = inf( size( sizes ) );
0292 worstTime = 0;
0293 numops = inf( size( sizes ) );
0294 maxIterations = 200;
0295 xlim = [-2, 0.5];
0296 ylim = [ -1.25,  1.25];
0297 
0298 progressTitle = sprintf( 'Mandelbrot (%s, %s)', type, device );
0299 progressTotal = sum(sizes);
0300 gpubench.multiWaitbar( progressTitle, 0 );
0301 
0302 for ii=1:numel(sizes)
0303     gridSize = round( sqrt( sizes(ii) ) );
0304     % Check for getting close to time-out
0305     if tooCloseToTimeout( worstTime, device )
0306         %fprintf( 'Skipping Mandelbrot of size %ux%u to prevent timeout.\n', gridSize, gridSize );
0307         times(ii) = nan;
0308         continue;
0309     end        
0310     if strcmpi( device, 'GPU' )
0311         try
0312             
0313             x = parallel.gpu.GPUArray.linspace( xlim(1), xlim(2), gridSize ); %#ok<GPUARB>
0314             y = parallel.gpu.GPUArray.linspace( ylim(1), ylim(2), gridSize ); %#ok<GPUARB>
0315             [xGrid,yGrid] = meshgrid( x, y );
0316             
0317             % Calculate
0318             for rr=1:reps
0319                 t = tic();
0320                 count = arrayfun( @processMandelbrotElement, xGrid, yGrid, maxIterations );
0321                 elapsedTime = gtoc(t);
0322                 times(ii) = min( times(ii), elapsedTime );
0323                 worstTime = max( worstTime, elapsedTime );
0324             end
0325             % Use the count to work out the number of operations
0326             % Each iteration of a single element requires:
0327             %  * abs(complex) = 3 flop
0328             %  * count+1 = 1 flop
0329             %  * z*z + z0 = 8 flop
0330             numops(ii) = gather( sum(count(:)*12) );
0331             
0332             clear count;
0333             % Update both progress bars
0334             inc = sizes(ii)/(reps*progressTotal);
0335             gpubench.multiWaitbar( progressTitle, 'Increment', inc );
0336             gpubench.multiWaitbar( mainProgressTitle, 'Increment', inc/numTasks );
0337             
0338         catch err %#ok<NASGU>
0339             %fprintf( 'discarded Mandelbrot of %ux%u.\n', gridSize, gridSize );
0340             times(ii) = nan;
0341         end
0342         
0343     else
0344         % Host version. This takes too long to do several repeats so we
0345         % just run it once.
0346         x = linspace( xlim(1), xlim(2), gridSize );
0347         y = linspace( ylim(1), ylim(2), gridSize );
0348         [xGrid,yGrid] = meshgrid( x, y );
0349         z0 = complex(xGrid,yGrid);
0350         t = tic();
0351         z = z0;
0352         count = zeros(size(z));
0353         for n = 1:maxIterations
0354             inside = ((real(z).^2 + imag(z).^2) <= 4);
0355             count = count + inside;
0356             z = z.*z + z0;
0357         end
0358         times(ii) = toc(t);
0359         % Each iteration of a single element requires:
0360         %  * inside check = 3 flop
0361         %  * count+1 = 1 flop
0362         %  * z*z + z0 = 8 flop
0363         % Since every element does the same amount of work in this
0364         % version, the operation count is simply 12*numel*maxIters
0365         numops(ii) = 12*numel(z)*maxIterations;
0366         
0367         % Update both progress bars
0368         gpubench.multiWaitbar( progressTitle, 'Increment', sizes(ii)/progressTotal );
0369         gpubench.multiWaitbar( mainProgressTitle, 'Increment', (sizes(ii)/progressTotal)/numTasks );
0370     end
0371 end
0372 gpubench.multiWaitbar( progressTitle, 'Close' );
0373 
0374 % Clear any dud results
0375 sizes(isnan( times )) = [];
0376 numops(isnan( times )) = [];
0377 times(isnan( times )) = [];
0378 
0379 data = addResult( data, 'Mandelbrot', type, sizes, numops, times );
0380 
0381 %-------------------------------------------------------------------------%
0382 function elapsedTime = gtoc( timer )
0383 % Wait for GPU operations to complete, then call toc
0384 persistent hasWait;
0385 if isempty(hasWait)
0386     try
0387         wait(gpuDevice);
0388         hasWait = true;
0389     catch err %#ok<NASGU>
0390         hasWait = false;
0391     end
0392 elseif hasWait
0393     wait(gpuDevice);
0394 end
0395 elapsedTime = toc(timer);
0396 
0397 %-------------------------------------------------------------------------%
0398 function sizes = getTestSizes( type, safetyFactor, device )
0399 % Return a range of test sizes (powers of two) that fit in device memory
0400 elementSize = gpubench.sizeof( type );
0401 if strcmpi( device, 'Host' )
0402     % On the host everything takes longer, so don't go as far
0403     safetyFactor = safetyFactor*2;
0404 end
0405 
0406 % Use as much memory as we can.
0407 if parallel.gpu.GPUDevice.isAvailable()
0408     gpu = gpuDevice();
0409     freeMem = gpu.FreeMemory;
0410 else
0411     % No GPU to get memory size, so just go for 4GB
0412     freeMem = 4*2^30;
0413 end
0414 maxNumElements = floor( freeMem / (elementSize*safetyFactor) );
0415 if isnan( maxNumElements ) || maxNumElements < 1e6
0416     error( 'gpuBench:NotEnoughMemory', 'Not enough free device memory to run tasks' );
0417 end
0418 
0419 % We want powers of two up to this size
0420 maxPower = floor( log2( maxNumElements ) );
0421 sizes = power( 2, 10:2:maxPower );
0422 
0423 
0424 %-------------------------------------------------------------------------%
0425 function stopNow = tooCloseToTimeout( time, device )
0426 % Should a test stop early to avoid triggering the device time-out?
0427 stopNow = false;
0428 if strcmpi( device, 'Host' )
0429     % On the host there is no time limit
0430 else
0431     gpu = gpuDevice();
0432     % If the kernel has a timeout it is typically 2-5 seconds. If we have
0433     % just done a size that takes more than a quarter of a second, the next
0434     % size will likely trigger the timeout.
0435     stopNow = (gpu.KernelExecutionTimeout && time>0.25);
0436 end
0437 
0438 
0439 
0440 %-------------------------------------------------------------------------%
0441 function count = processMandelbrotElement(x0,y0,maxIterations)
0442 % Evaluate the Mandelbrot function for a single element
0443 %
0444 %   m = processMandelbrotElement(x0,y0,maxIterations) evaluates the
0445 %   number of steps before the complex value (x0,y0) jumps outside a circle
0446 %   of radius two on the complex plane. Each iteration involves mapping
0447 %   z=z^2+z0 where z0=x0+i*y0. The return value is the iteration count
0448 %   at escape, or maxIterations if the point did not escape.
0449 z0 = complex(x0,y0);
0450 z = z0;
0451 count = 0;
0452 while (count < maxIterations) ...
0453         && ((real(z)*real(z) + imag(z)*imag(z)) <= 4)
0454     count = count + 1;
0455     z = z*z + z0;
0456 end

Community support and wiki are available on Redmine. Last update: 18-Apr-2019.