Example: LU factorization
Objectives
Apply StarPU to a real algorithm.
Exercise
Parallize the finely-blocked LU factorization algorithm using StarPU.
Solution
1#include <stdio.h>
2#include <stdlib.h>
3#include <time.h>
4#include <starpu.h>
5
6extern double dnrm2_(int const *, double const *, int const *);
7
8extern void dtrmm_(char const *, char const *, char const *, char const *,
9 int const *, int const *, double const *, double const *, int const *,
10 double *, int const *);
11
12extern void dlacpy_(char const *, int const *, int const *, double const *,
13 int const *, double *, int const *);
14
15extern double dlange_(char const *, int const *, int const *, double const *,
16 int const *, double *);
17
18extern void dtrsm_(char const *, char const *, char const *, char const *,
19 int const *, int const *, double const *, double const *, int const *,
20 double *, int const *);
21
22extern void dgemm_(char const *, char const *, int const *, int const *,
23 int const *, double const *, double const *, int const *, double const *,
24 int const *, double const *, double *, int const *);
25
26double one = 1.0;
27double minus_one = -1.0;
28
29// returns the ceil of a / b
30int DIVCEIL(int a, int b)
31{
32 return (a+b-1)/b;
33}
34
35// returns the minimum of a and b
36int MIN(int a, int b)
37{
38 return a < b ? a : b;
39}
40
41// returns the maxinum of a and b
42int MAX(int a, int b)
43{
44 return a > b ? a : b;
45}
46
47void simple_lu(int n, int ldA, double *A)
48{
49 for (int i = 0; i < n; i++) {
50 for (int j = i+1; j < n; j++) {
51 A[i*ldA+j] /= A[i*ldA+i];
52
53 for (int k = i+1; k < n; k++)
54 A[k*ldA+j] -= A[i*ldA+j] * A[k*ldA+i];
55 }
56 }
57}
58
59////////////////////////////////////////////////////////////////////////////
60////////////////////////////////////////////////////////////////////////////
61////////////////////////////////////////////////////////////////////////////
62
63//
64// Kernel implementations:
65//
66// Each implementation must have the following prototype:
67// void (*func)(void *buffers[], void *args);
68//
69// The 'buffers' argument encapsulates the input and output buffers. The 'args'
70// argument encapsulates optional static arguments.
71//
72
73// a CPU implementation for the kernel that computes a small LU decomposition
74static void small_lu(void *buffers[], void *args)
75{
76 // In this case the kernel has a single input and output buffer. The buffer
77 // is accessible through a matrix interface.
78 struct starpu_matrix_interface *A_i =
79 (struct starpu_matrix_interface *)buffers[0];
80
81 // we can now extract the relevant information from the interface
82 double *ptr = (double *) STARPU_MATRIX_GET_PTR(A_i); // pointer
83 const int n = STARPU_MATRIX_GET_NX(A_i); // matrix dimension
84 const int ld = STARPU_MATRIX_GET_LD(A_i); // leading dimension
85
86 // The runtime system guarantees that the data resides in the device memory
87 // (main memory in this case). Thus, we can call the simple_lu function to
88 // perform the actual computations.
89 simple_lu(n, ld, ptr);
90}
91
92// a CPU implementation for the kernel that performs a block row/column update
93static void rc_update(void *buffers[], void *args)
94{
95 // The first four dtrsm arguments are passed as statix arguments. This
96 // allows us to use the same codelet to perform the block row and block
97 // column updates.
98 char side, uplo, transa, diag;
99 starpu_codelet_unpack_args(args, &side, &uplo, &transa, &diag);
100
101 // This time we have two buffers:
102 // 0 = a small LU decomposition that corresponds to the diagonal block
103 // 1 = current row/column block
104 //
105 // Note that we do not have define the interface explicitly.
106
107 dtrsm_(&side, &uplo, &transa, &diag,
108 (int *)&STARPU_MATRIX_GET_NX(buffers[1]),
109 (int *)&STARPU_MATRIX_GET_NY(buffers[1]),
110 &one,
111 (double *)STARPU_MATRIX_GET_PTR(buffers[0]),
112 (int *)&STARPU_MATRIX_GET_LD(buffers[0]),
113 (double *)STARPU_MATRIX_GET_PTR(buffers[1]),
114 (int *)&STARPU_MATRIX_GET_LD(buffers[1]));
115
116}
117
118// a CPU implementation for the kernel that performs a trailing matrix update
119static void trail_update(void *buffers[], void *args)
120{
121 // This time we have three buffers:
122 // 0 = corresponding column block
123 // 1 = corresponding row block
124 // 2 = current trailing matrix block
125
126 dgemm_("No Transpose", "No Transpose",
127 (int *)&STARPU_MATRIX_GET_NX(buffers[2]),
128 (int *)&STARPU_MATRIX_GET_NY(buffers[2]),
129 (int *)&STARPU_MATRIX_GET_NY(buffers[0]),
130 &minus_one,
131 (double *)STARPU_MATRIX_GET_PTR(buffers[0]),
132 (int *)&STARPU_MATRIX_GET_LD(buffers[0]),
133 (double *)STARPU_MATRIX_GET_PTR(buffers[1]),
134 (int *)&STARPU_MATRIX_GET_LD(buffers[1]),
135 &one,
136 (double *)STARPU_MATRIX_GET_PTR(buffers[2]),
137 (int *)&STARPU_MATRIX_GET_LD(buffers[2]));
138}
139
140//
141// Codelets
142//
143// A codelet encapsulates the various implementations of a computational
144// kernel.
145//
146
147// a codelet that computes a small LU decomposition
148static struct starpu_codelet small_lu_cl = {
149 .name = "small_lu", // codelet name
150 .cpu_funcs = { small_lu }, // pointers to the CPU implementations
151 .nbuffers = 1, // buffer count
152 .modes = { STARPU_RW } // buffer access modes (read-write)
153};
154
155// a codelet that that performs a block row/column update
156static struct starpu_codelet rc_update_cl = {
157 .name = "rc_update",
158 .cpu_funcs = { rc_update },
159 .nbuffers = 2,
160 .modes = { STARPU_R, STARPU_RW } // read-only, read-write
161};
162
163// a codelet that performs a trailing matrix update
164static struct starpu_codelet trail_update_cl = {
165 .name = "trail_update",
166 .cpu_funcs = { trail_update },
167 .nbuffers = 3,
168 .modes = { STARPU_R, STARPU_R, STARPU_RW }
169};
170
171void blocked_lu(int block_size, int n, int ldA, double *A)
172{
173 const int block_count = DIVCEIL(n, block_size);
174
175 // initialize StarPU
176 int ret = starpu_init(NULL);
177
178 if (ret != 0)
179 return;
180
181 // Each buffer that is to be passed to a task must be encapsulated inside a
182 // data handle. This means that we must allocate and fill an array that
183 // stores the block handles.
184
185 starpu_data_handle_t **blocks =
186 malloc(block_count*sizeof(starpu_data_handle_t *));
187
188 for (int i = 0; i < block_count; i++) {
189 blocks[i] = malloc(block_count*sizeof(starpu_data_handle_t));
190
191 for (int j = 0; j < block_count; j++) {
192 // each block is registered as a matrix
193 starpu_matrix_data_register(
194 &blocks[i][j], // handle
195 STARPU_MAIN_RAM, // memory node
196 (uintptr_t)(A+(j*ldA+i)*block_size), // pointer
197 ldA, // leading dimension
198 MIN(block_size, n-i*block_size), // row count
199 MIN(block_size, n-j*block_size), // column count
200 sizeof(double)); // element size
201 }
202 }
203
204 // go through the diagonal blocks
205 for (int i = 0; i < block_count; i++) {
206
207 // insert a task that processes the current diagonal block
208 starpu_task_insert(
209 &small_lu_cl, // codelet
210 STARPU_PRIORITY, // the next argument specifies the priority
211 STARPU_MAX_PRIO, // priority
212 STARPU_RW, // the next argument is a read-write handle
213 blocks[i][i], // handle to the diagonal block
214 0); // a null pointer finalizes the call
215
216 // insert tasks that process the blocks to the right of the current
217 // diagonal block
218 for (int j = i+1; j < block_count; j++) {
219
220 // blocks[i][j] <- L1(blocks[i][i]) \ blocks[i][j]
221 starpu_task_insert(&rc_update_cl,
222 STARPU_PRIORITY, MAX(STARPU_MIN_PRIO, STARPU_MAX_PRIO-j+i),
223 STARPU_VALUE, // the next argument is a static argument
224 "Left", // pointer to the static argument
225 sizeof(char), // size of the static argument
226 STARPU_VALUE, "Lower", sizeof(char),
227 STARPU_VALUE, "No transpose", sizeof(char),
228 STARPU_VALUE, "Unit triangular", sizeof(char),
229 STARPU_R, blocks[i][i],
230 STARPU_RW, blocks[i][j], 0);
231 }
232
233 // insert tasks that process the blocks below the current diagonal block
234 for (int j = i+1; j < block_count; j++) {
235
236 // blocks[j][i] <- U(blocks[i][i]) / blocks[j][i]
237 starpu_task_insert(&rc_update_cl,
238 STARPU_PRIORITY, MAX(STARPU_MIN_PRIO, STARPU_MAX_PRIO-j+i),
239 STARPU_VALUE, "Right", sizeof(char),
240 STARPU_VALUE, "Upper", sizeof(char),
241 STARPU_VALUE, "No transpose", sizeof(char),
242 STARPU_VALUE, "Not unit triangular", sizeof(char),
243 STARPU_R, blocks[i][i],
244 STARPU_RW, blocks[j][i], 0);
245 }
246
247 // insert tasks that process the trailing matrix
248 for (int ii = i+1; ii < block_count; ii++) {
249 for (int jj = i+1; jj < block_count; jj++) {
250
251 // blocks[ii][jj] <-
252 // blocks[ii][jj] - blocks[ii][i] * blocks[i][jj]
253 starpu_task_insert(&trail_update_cl,
254 STARPU_PRIORITY, MAX(
255 MAX(STARPU_MIN_PRIO, STARPU_MAX_PRIO-ii+i),
256 MAX(STARPU_MIN_PRIO, STARPU_MAX_PRIO-jj+i)),
257 STARPU_R, blocks[ii][i],
258 STARPU_R, blocks[i][jj],
259 STARPU_RW, blocks[ii][jj], 0);
260 }
261 }
262 }
263
264 // free allocated resources
265 for (int i = 0; i < block_count; i++) {
266 for (int j = 0; j < block_count; j++) {
267
268 // The data handles must be unregistered. The main thread waits
269 // until all related tasks have been completed and the data is
270 // copied back to its original location.
271 starpu_data_unregister(blocks[i][j]);
272 }
273 free(blocks[i]);
274 }
275 free(blocks);
276
277 starpu_shutdown();
278}
279
280
281////////////////////////////////////////////////////////////////////////////
282////////////////////////////////////////////////////////////////////////////
283////////////////////////////////////////////////////////////////////////////
284
285// computes C <- L * U
286void mul_lu(int n, int lda, int ldb, double const *A, double *B)
287{
288 // B <- U(A) = U
289 for (int i = 0; i < n; i++) {
290 for (int j = 0; j < i+1; j++)
291 B[i*ldb+j] = A[i*lda+j];
292 for (int j = i+1; j < n; j++)
293 B[i*ldb+j] = 0.0;
294 }
295
296 // B <- L1(A) * B = L * U
297 dtrmm_("Left", "Lower", "No Transpose", "Unit triangular",
298 &n, &n, &one, A, &lda, B, &ldb);
299}
300
301int main(int argc, char **argv)
302{
303 //
304 // check arguments
305 //
306
307 if (argc != 3) {
308 fprintf(stderr,
309 "[error] Incorrect arguments. Use %s (n) (block size)\n", argv[0]);
310 return EXIT_FAILURE;
311 }
312
313 int n = atoi(argv[1]);
314 if (n < 1) {
315 fprintf(stderr, "[error] Invalid matrix dimension.\n");
316 return EXIT_FAILURE;
317 }
318
319 int block_size = atoi(argv[2]);
320 if (block_size < 2) {
321 fprintf(stderr, "[error] Invalid block size.\n");
322 return EXIT_FAILURE;
323 }
324
325 //
326 // Initialize matrix A and store a duplicate to matrix B. Matrix C is for
327 // validation.
328 //
329
330 srand(time(NULL));
331
332 int ldA, ldB, ldC;
333 ldA = ldB = ldC = DIVCEIL(n, 8)*8; // align to 64 bytes
334 double *A = (double *) aligned_alloc(8, n*ldA*sizeof(double));
335 double *B = (double *) aligned_alloc(8, n*ldB*sizeof(double));
336 double *C = (double *) aligned_alloc(8, n*ldC*sizeof(double));
337
338 if (A == NULL || B == NULL || C == NULL) {
339 fprintf(stderr, "[error] Failed to allocate memory.\n");
340 return EXIT_FAILURE;
341 }
342
343 // A <- random diagonally dominant matrix
344 for (int i = 0; i < n; i++) {
345 for (int j = 0; j < n; j++)
346 A[i*ldA+j] = B[i*ldB+j] = 2.0*rand()/RAND_MAX - 1.0;
347 A[i*ldA+i] = B[i*ldB+i] = 1.0*rand()/RAND_MAX + n;
348 }
349
350 //
351 // compute
352 //
353
354 struct timespec ts_start;
355 clock_gettime(CLOCK_MONOTONIC, &ts_start);
356
357 // A <- (L,U)
358 blocked_lu(block_size, n, ldA, A);
359
360 struct timespec ts_stop;
361 clock_gettime(CLOCK_MONOTONIC, &ts_stop);
362
363 printf("Time = %f s\n",
364 ts_stop.tv_sec - ts_start.tv_sec +
365 1.0E-9*(ts_stop.tv_nsec - ts_start.tv_nsec));
366
367 // C <- L * U
368 mul_lu(n, ldA, ldC, A, C);
369
370 //
371 // validate
372 //
373
374 // C <- L * U - B
375 for (int i = 0; i < n; i++)
376 for (int j = 0; j < n; j++)
377 C[i*ldC+j] -= B[i*ldB+j];
378
379 // compute || C ||_F / || B ||_F = || L * U - B ||_F / || B ||_F
380 double residual = dlange_("Frobenius", &n, &n, C, &ldC, NULL) /
381 dlange_("Frobenius", &n, &n, B, &ldB, NULL);
382
383 printf("Residual = %E\n", residual);
384
385 int ret = EXIT_SUCCESS;
386 if (1.0E-12 < residual) {
387 fprintf(stderr, "The residual is too large.\n");
388 ret = EXIT_FAILURE;
389 }
390
391 //
392 // cleanup
393 //
394
395 free(A);
396 free(B);
397 free(C);
398
399 return ret;
400}