342 lines
8.6 KiB
C
342 lines
8.6 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
 * Copyright (c) International Business Machines Corp., 2007, 2008
 *
 * Authors: Darren Hart <dvhltc@us.ibm.com>
 *          Dinakar Guniguntala <dino@in.ibm.com>
 */
|
|
/*\
 * [Description]
 *
 * Compare the time taken to run matrix multiplication routines
 * sequentially with the time taken to run them in parallel, one
 * thread per online CPU, to judge multiprocessor performance.
 */
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <librttest.h>
|
|
#include <libstats.h>
|
|
|
|
/* Upper bound on the CPU ids probed when pinning a thread */
#define MAX_CPUS	8192
/* Priority passed to set_priority() and create_fifo_thread() */
#define PRIO		43
/* Matrices are MATRIX_SIZE x MATRIX_SIZE doubles */
#define MATRIX_SIZE	100
/*
 * Default number of multiplications per iteration: the higher the number,
 * the more CPU intensive (and therefore SMP performance goes up).
 */
#define DEF_OPS		8
/* Avg concurrent time * pass criteria < avg seq time - for every addition of a cpu */
#define PASS_CRITERIA	0.75
/* Default number of recorded iterations */
#define ITERATIONS	128
/* Size of the histogram stats containers */
#define HIST_BUCKETS	100

#define THREAD_WAIT	1
#define THREAD_WORK	2
#define THREAD_DONE	3

/*
 * Parenthesized so the expansion stays a single operand when the macro is
 * used inside a larger expression (e.g. as a divisor).
 */
#define THREAD_SLEEP	(1 * NS_PER_US)
|
|
|
|
static int ops = DEF_OPS;
|
|
static int numcpus;
|
|
static float criteria;
|
|
static int *tids;
|
|
static int online_cpu_id = -1;
|
|
static int iterations = ITERATIONS;
|
|
static int iterations_percpu;
|
|
|
|
stats_container_t sdat, cdat, *curdat;
|
|
stats_container_t shist, chist;
|
|
static pthread_barrier_t mult_start;
|
|
|
|
struct matrices {
|
|
double A[MATRIX_SIZE][MATRIX_SIZE];
|
|
double B[MATRIX_SIZE][MATRIX_SIZE];
|
|
double C[MATRIX_SIZE][MATRIX_SIZE];
|
|
};
|
|
|
|
/*
 * Print the help text: rt_help() first (presumably the options shared by
 * all librttest programs), then the options specific to this test.
 */
static void usage(void)
{
	rt_help();
	printf("matrix_mult specific options:\n"
	       " -l# #: number of multiplications per iteration (load)\n"
	       " -i# #: number of iterations\n");
}
|
|
|
|
static int parse_args(int c, char *v)
|
|
{
|
|
int handled = 1;
|
|
switch (c) {
|
|
case 'i':
|
|
iterations = atoi(v);
|
|
break;
|
|
case 'l':
|
|
ops = atoi(v);
|
|
break;
|
|
case 'h':
|
|
usage();
|
|
exit(0);
|
|
default:
|
|
handled = 0;
|
|
break;
|
|
}
|
|
return handled;
|
|
}
|
|
|
|
static void matrix_init(double A[MATRIX_SIZE][MATRIX_SIZE],
|
|
double B[MATRIX_SIZE][MATRIX_SIZE])
|
|
{
|
|
int i, j;
|
|
for (i = 0; i < MATRIX_SIZE; i++) {
|
|
for (j = 0; j < MATRIX_SIZE; j++) {
|
|
A[i][j] = (double)(i * j);
|
|
B[i][j] = (double)((i * j) % 10);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void matrix_mult(struct matrices *matrices)
|
|
{
|
|
int i, j, k;
|
|
|
|
matrix_init(matrices->A, matrices->B);
|
|
for (i = 0; i < MATRIX_SIZE; i++) {
|
|
int i_m = MATRIX_SIZE - i;
|
|
for (j = 0; j < MATRIX_SIZE; j++) {
|
|
double sum = matrices->A[i_m][j] * matrices->B[j][i];
|
|
for (k = 0; k < MATRIX_SIZE; k++)
|
|
sum += matrices->A[i_m][k] * matrices->B[k][j];
|
|
matrices->C[i][j] = sum;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void matrix_mult_record(struct matrices *matrices, int index)
|
|
{
|
|
nsec_t start, end, delta;
|
|
int i;
|
|
|
|
start = rt_gettime();
|
|
for (i = 0; i < ops; i++)
|
|
matrix_mult(matrices);
|
|
end = rt_gettime();
|
|
delta = (long)((end - start) / NS_PER_US);
|
|
curdat->records[index].x = index;
|
|
curdat->records[index].y = delta;
|
|
}
|
|
|
|
static int set_affinity(void)
|
|
{
|
|
static pthread_mutex_t mutex_cpu = PTHREAD_MUTEX_INITIALIZER;
|
|
cpu_set_t mask;
|
|
int cpuid;
|
|
|
|
pthread_mutex_lock(&mutex_cpu);
|
|
do {
|
|
++online_cpu_id;
|
|
CPU_ZERO(&mask);
|
|
CPU_SET(online_cpu_id, &mask);
|
|
|
|
if (!sched_setaffinity(0, sizeof(mask), &mask)) {
|
|
cpuid = online_cpu_id; /* Save this value before unlocking mutex */
|
|
pthread_mutex_unlock(&mutex_cpu);
|
|
return cpuid;
|
|
}
|
|
} while (online_cpu_id < MAX_CPUS);
|
|
pthread_mutex_unlock(&mutex_cpu);
|
|
return -1;
|
|
}
|
|
|
|
static void *concurrent_thread(void *thread)
|
|
{
|
|
struct thread *t = (struct thread *)thread;
|
|
struct matrices *matrices = (struct matrices *) t->arg;
|
|
int thread_id = (intptr_t) t->id;
|
|
int cpuid;
|
|
int i;
|
|
int index;
|
|
|
|
cpuid = set_affinity();
|
|
if (cpuid == -1) {
|
|
fprintf(stderr, "Thread %d: Can't set affinity.\n", thread_id);
|
|
exit(1);
|
|
}
|
|
|
|
index = iterations_percpu * thread_id; /* To avoid stats overlapping */
|
|
pthread_barrier_wait(&mult_start);
|
|
for (i = 0; i < iterations_percpu; i++)
|
|
matrix_mult_record(matrices, index++);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static int main_thread(void)
|
|
{
|
|
int ret, i, j;
|
|
nsec_t start, end;
|
|
long smin = 0, smax = 0, cmin = 0, cmax = 0, delta = 0;
|
|
float savg, cavg;
|
|
int cpuid;
|
|
struct matrices *matrices[numcpus];
|
|
|
|
for (i = 0; i < numcpus; ++i)
|
|
matrices[i] = malloc(sizeof(struct matrices));
|
|
|
|
if (stats_container_init(&sdat, iterations) ||
|
|
stats_container_init(&shist, HIST_BUCKETS) ||
|
|
stats_container_init(&cdat, iterations) ||
|
|
stats_container_init(&chist, HIST_BUCKETS)
|
|
) {
|
|
fprintf(stderr, "Cannot init stats container\n");
|
|
exit(1);
|
|
}
|
|
|
|
tids = calloc(numcpus, sizeof(int));
|
|
if (!tids) {
|
|
perror("malloc");
|
|
exit(1);
|
|
}
|
|
|
|
cpuid = set_affinity();
|
|
if (cpuid == -1) {
|
|
fprintf(stderr, "Main thread: Can't set affinity.\n");
|
|
exit(1);
|
|
}
|
|
|
|
/* run matrix mult operation sequentially */
|
|
curdat = &sdat;
|
|
curdat->index = iterations - 1;
|
|
printf("\nRunning sequential operations\n");
|
|
start = rt_gettime();
|
|
for (i = 0; i < iterations; i++)
|
|
matrix_mult_record(matrices[0], i);
|
|
end = rt_gettime();
|
|
delta = (long)((end - start) / NS_PER_US);
|
|
|
|
savg = delta / iterations; /* don't use the stats record, use the total time recorded */
|
|
smin = stats_min(&sdat);
|
|
smax = stats_max(&sdat);
|
|
|
|
printf("Min: %ld us\n", smin);
|
|
printf("Max: %ld us\n", smax);
|
|
printf("Avg: %.4f us\n", savg);
|
|
printf("StdDev: %.4f us\n", stats_stddev(&sdat));
|
|
|
|
if (stats_hist(&shist, &sdat) ||
|
|
stats_container_save("sequential",
|
|
"Matrix Multiplication Sequential Execution Runtime Scatter Plot",
|
|
"Iteration", "Runtime (us)", &sdat, "points")
|
|
|| stats_container_save("sequential_hist",
|
|
"Matrix Multiplicatoin Sequential Execution Runtime Histogram",
|
|
"Runtime (us)", "Samples", &shist, "steps")
|
|
) {
|
|
fprintf(stderr,
|
|
"Warning: could not save sequential mults stats\n");
|
|
}
|
|
|
|
pthread_barrier_init(&mult_start, NULL, numcpus + 1);
|
|
set_priority(PRIO);
|
|
curdat = &cdat;
|
|
curdat->index = iterations - 1;
|
|
online_cpu_id = -1; /* Redispatch cpus */
|
|
/* Create numcpus-1 concurrent threads */
|
|
for (j = 0; j < numcpus; j++) {
|
|
tids[j] = create_fifo_thread(concurrent_thread, matrices[j], PRIO);
|
|
if (tids[j] == -1) {
|
|
printf
|
|
("Thread creation failed (max threads exceeded?)\n");
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
/* run matrix mult operation concurrently */
|
|
printf("\nRunning concurrent operations\n");
|
|
pthread_barrier_wait(&mult_start);
|
|
start = rt_gettime();
|
|
join_threads();
|
|
end = rt_gettime();
|
|
|
|
delta = (long)((end - start) / NS_PER_US);
|
|
|
|
cavg = delta / iterations; /* don't use the stats record, use the total time recorded */
|
|
cmin = stats_min(&cdat);
|
|
cmax = stats_max(&cdat);
|
|
|
|
printf("Min: %ld us\n", cmin);
|
|
printf("Max: %ld us\n", cmax);
|
|
printf("Avg: %.4f us\n", cavg);
|
|
printf("StdDev: %.4f us\n", stats_stddev(&cdat));
|
|
|
|
if (stats_hist(&chist, &cdat) ||
|
|
stats_container_save("concurrent",
|
|
"Matrix Multiplication Concurrent Execution Runtime Scatter Plot",
|
|
"Iteration", "Runtime (us)", &cdat, "points")
|
|
|| stats_container_save("concurrent_hist",
|
|
"Matrix Multiplication Concurrent Execution Runtime Histogram",
|
|
"Iteration", "Runtime (us)", &chist,
|
|
"steps")
|
|
) {
|
|
fprintf(stderr,
|
|
"Warning: could not save concurrent mults stats\n");
|
|
}
|
|
|
|
printf("\nConcurrent Multipliers:\n");
|
|
printf("Min: %.4f\n", (float)smin / cmin);
|
|
printf("Max: %.4f\n", (float)smax / cmax);
|
|
printf("Avg: %.4f\n", (float)savg / cavg);
|
|
|
|
ret = 1;
|
|
if (savg > (cavg * criteria))
|
|
ret = 0;
|
|
printf
|
|
("\nCriteria: %.2f * average concurrent time < average sequential time\n",
|
|
criteria);
|
|
printf("Result: %s\n", ret ? "FAIL" : "PASS");
|
|
|
|
for (i = 0; i < numcpus; i++)
|
|
free(matrices[i]);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int main(int argc, char *argv[])
|
|
{
|
|
setup();
|
|
pass_criteria = PASS_CRITERIA;
|
|
rt_init("l:i:h", parse_args, argc, argv);
|
|
numcpus = sysconf(_SC_NPROCESSORS_ONLN);
|
|
/* the minimum avg concurrent multiplier to pass */
|
|
criteria = pass_criteria * numcpus;
|
|
int new_iterations, ret;
|
|
|
|
if (iterations <= 0) {
|
|
fprintf(stderr, "iterations must be greater than zero\n");
|
|
exit(1);
|
|
}
|
|
|
|
printf("\n---------------------------------------\n");
|
|
printf("Matrix Multiplication (SMP Performance)\n");
|
|
printf("---------------------------------------\n\n");
|
|
|
|
/* Line below rounds up iterations to a multiple of numcpus.
|
|
* Without this, having iterations not a mutiple of numcpus causes
|
|
* stats to segfault (overflow stats array).
|
|
*/
|
|
new_iterations = (int)((iterations + numcpus - 1) / numcpus) * numcpus;
|
|
if (new_iterations != iterations)
|
|
printf
|
|
("Rounding up iterations value to nearest multiple of total online CPUs\n");
|
|
|
|
iterations = new_iterations;
|
|
iterations_percpu = iterations / numcpus;
|
|
|
|
printf("Running %d iterations\n", iterations);
|
|
printf("Matrix Dimensions: %dx%d\n", MATRIX_SIZE, MATRIX_SIZE);
|
|
printf("Calculations per iteration: %d\n", ops);
|
|
printf("Number of CPUs: %u\n", numcpus);
|
|
|
|
set_priority(PRIO);
|
|
ret = main_thread();
|
|
|
|
return ret;
|
|
}
|