Example: LU factorization

Objectives

  • Apply task-based parallelism to a real algorithm.

Scalar algorithm

The algorithm we are considering is called LU factorization (or LU decomposition) without pivoting. Let \(A\) be a square matrix. An LU factorization refers to the factorization of \(A\) into a lower triangular matrix \(L\) and an upper triangular matrix \(U\):

\[A = L U.\]

For example, consider the following example from Wikipedia:

\[\begin{split}\left[ \begin{matrix} 4 & 3 \\ 6 & 3 \end{matrix} \right] = \left[ \begin{matrix} 1 & 0 \\ 1.5 & 1 \end{matrix} \right] \left[ \begin{matrix} 4 & 3 \\ 0 & -1.5 \end{matrix} \right].\end{split}\]

If we want to solve a system of linear equations

\[A x = b\]

for \(x\) and have already computed the LU factorization

\[A = A U,\]

then we can solve \(A x = b\) as follows:

  1. Solve the equation \(L y = b\) for \(y\).

  2. Solve the equation \(U x = y\) for \(x\).

The factorization is typically normalized such that the matrix \(L\) is unit lower triangular, i.e., the diagonal entries are all ones. This allows us to store the factor matrices on top of the original matrix:

\[A \leftarrow U + L - I.\]

In case of the earlier example, we get

\[\begin{split}\left[ \begin{matrix} 4 & 3 \\ 0 & -1.5 \end{matrix} \right] + \left[ \begin{matrix} 1 & 0 \\ 1.5 & 1 \end{matrix} \right] - \left[ \begin{matrix} 1 & 0 \\ 0 & 1 \end{matrix} \right] = \left[ \begin{matrix} 4 & 3 \\ 1.5 & -1.5 \end{matrix} \right].\end{split}\]

Scalar implementation

The scalar sequential version of the algorithm is rather simple:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
void simple_lu(int n, int ld, double *A)
{
    for (int i = 0; i < n; i++) {
        for (int j = i+1; j < n; j++) {
            A[i*ld+j] /= A[i*ld+i];

            for (int k = i+1; k < n; k++)
                A[k*ld+j] -= A[i*ld+j] * A[k*ld+i];
        }
    }
}

The LU factorization of the \(n \times n\) matrix \(A\) (column-major order with leading dimension ld) is stored on top of the original matrix \(A\).

Test program

We can validate the result with the following test program:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

extern void dtrmm_(char const *, char const *, char const *, char const *,
    int const *, int const *, double const *, double const *, int const *,
    double *, int const *);

extern double dlange_(char const *, int const *, int const *, double const *,
    int const *, double *);

void simple_lu(int n, int ld, double *A)
{
    for (int i = 0; i < n; i++) {
        for (int j = i+1; j < n; j++) {
            A[i*ld+j] /= A[i*ld+i];

            for (int k = i+1; k < n; k++)
                A[k*ld+j] -= A[i*ld+j] * A[k*ld+i];
        }
    }
}

// returns the ceil of a / b
int DIVCEIL(int a, int b)
{
    return (a+b-1)/b;
}

// computes C <- L * U
void mul_lu(int n, int lda, int ldb, double const *A, double *B)
{
    double one = 1.0;

    // B <- U(A) = U
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < i+1; j++)
            B[i*ldb+j] = A[i*lda+j];
        for (int j = i+1; j < n; j++)
            B[i*ldb+j] = 0.0;
    }

    // B <- L1(A) * B = L * U
    dtrmm_("Left", "Lower", "No Transpose", "Unit triangular",
        &n, &n, &one, A, &lda, B, &ldb);
}

int main(int argc, char **argv)
{
    //
    // check arguments
    //

    if (argc != 2) {
        fprintf(stderr,
            "[error] Incorrect arguments. Use %s (n)\n", argv[0]);
        return EXIT_FAILURE;
    }

    int n = atoi(argv[1]);
    if (n < 1)  {
        fprintf(stderr, "[error] Invalid matrix dimension.\n");
        return EXIT_FAILURE;
    }

    //
    // Initialize matrix A and store a duplicate to matrix B. Matrix C is for
    // validation.
    //

    srand(time(NULL));

    int ldA, ldB, ldC;
    ldA = ldB = ldC = DIVCEIL(n, 8)*8; // align to 64 bytes
    double *A = (double *) aligned_alloc(8, n*ldA*sizeof(double));
    double *B = (double *) aligned_alloc(8, n*ldB*sizeof(double));
    double *C = (double *) aligned_alloc(8, n*ldC*sizeof(double));

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "[error] Failed to allocate memory.\n");
        return EXIT_FAILURE;
    }

    // A <- random diagonally dominant matrix
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++)
            A[i*ldA+j] = B[i*ldB+j] = 2.0*rand()/RAND_MAX - 1.0;
        A[i*ldA+i] = B[i*ldB+i] = 1.0*rand()/RAND_MAX + n;
    }

    //
    // compute
    //

    struct timespec ts_start;
    clock_gettime(CLOCK_MONOTONIC, &ts_start);

    // A <- (L,U)
    simple_lu(n, ldA, A);

    struct timespec ts_stop;
    clock_gettime(CLOCK_MONOTONIC, &ts_stop);

    printf("Time = %f s\n",
        ts_stop.tv_sec - ts_start.tv_sec +
        1.0E-9*(ts_stop.tv_nsec - ts_start.tv_nsec));

    // C <- L * U
    mul_lu(n, ldA, ldC, A, C);

    //
    // validate
    //

    // C <- L * U - B
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            C[i*ldC+j] -= B[i*ldB+j];

    // compute || C ||_F / || B ||_F = || L * U - B ||_F  / || B ||_F
    double residual = dlange_("Frobenius", &n, &n, C, &ldC, NULL) /
        dlange_("Frobenius", &n, &n, B, &ldB, NULL);

    printf("Residual = %E\n", residual);

    int ret = EXIT_SUCCESS;
    if (1.0E-12 < residual) {
        fprintf(stderr, "The residual is too large.\n");
        ret = EXIT_FAILURE;
    }

    //
    // cleanup
    //

    free(A);
    free(B);
    free(C);

    return ret;
}

We can compile and test the algorithm:

$ gcc -o scalar scalar.c  -Wall ${LIBLAPACK} ${LIBBLAS}
$ OMP_NUM_THREADS=1 ./scalar 3000
Time = 120.412646 s
Residual = 1.780586E-15

Above, ${LIBLAPACK} and ${LIBBLAS} are the LAPACK and BLAS libraries, respectively.

Challenge

Challenge

Compile and run the scalar implementation yourself.

Coarsely-blocked algorithm

The above scalar algorithm cannot be parallelized efficiently because we cannot collapse the loops and the innermost loop is too short for effective parallelization.

Coarsely-blocked implementation

We can solve some of the parallelization problems by writing the algorithm in a blocked form:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
void blocked_lu(int block_size, int n, int ldA, double *A)
{
    // allocate and fill an array that stores the block pointers
    int block_count = DIVCEIL(n, block_size);
    double ***blocks = (double ***) malloc(block_count*sizeof(double**));
    for (int i = 0; i < block_count; i++) {
        blocks[i] = (double **) malloc(block_count*sizeof(double*));

        for (int j = 0; j < block_count; j++)
            blocks[i][j] = A+(j*ldA+i)*block_size;
    }

    // iterate through the diagonal blocks
    for (int i = 0; i < block_count; i++) {

        // calculate diagonal block size
        int dsize = min(block_size, n-i*block_size);

        // calculate trailing matrix size
        int tsize = n-(i+1)*block_size;

        // compute the LU decomposition of the diagonal block
        simple_lu(dsize, ldA, blocks[i][i]);

        if (0 < tsize) {

            // blocks[i][i+1:] <- L1(blocks[i][i]) \ blocks[i][i+1:]
            dtrsm_("Left", "Lower", "No transpose", "Unit triangular",
                &dsize, &tsize, &one, blocks[i][i], &ldA, blocks[i][i+1], &ldA);

            // blocks[i+1:][i] <- U(blocks[i][i]) / blocks[i+1:][i]
            dtrsm_("Right", "Upper", "No Transpose", "Not unit triangular",
                &tsize, &dsize, &one, blocks[i][i], &ldA, blocks[i+1][i], &ldA);

            // blocks[i+1:][i+1:] <- blocks[i+1:][i+1:] -
            //                          blocks[i+1:][i] * blocks[i][i+1:]
            dgemm_("No Transpose", "No Transpose",
                &tsize, &tsize, &dsize, &minus_one, blocks[i+1][i],
                &ldA, blocks[i][i+1], &ldA, &one, blocks[i+1][i+1], &ldA);
        }
    }

    // free allocated resources
    for (int i = 0; i < block_count; i++)
        free(blocks[i]);
    free(blocks);
}

We have divided the matrix \(A\) into square blocks (possibly excluding the last block row and column). The two-dimensional array blocks contains the addresses of the upper-left corners of the blocks. That is, blocks[i-1][j-1] is the block on the i’th row and j’th column of the matrix. We then loop over the diagonal blocks as follows:

  1. We factorize the diagonal blocks using the scalar algorithm (simple_lu).

  2. The sub-factor matrices are used to update the block row and the block column by solving two matrix equations (dtrsm_).

  3. The trailing matrix is updated by computing a matrix-matrix multiplication (dgemm_).

../_images/blocked_lu1.png

The exact details are not that relevant as we are only interested in the data dependencies.

Test program

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

extern double dnrm2_(int const *, double const *, int const *);

extern void dtrmm_(char const *, char const *, char const *, char const *,
    int const *, int const *, double const *, double const *, int const *,
    double *, int const *);

extern void dlacpy_(char const *, int const *, int const *, double const *,
    int const *, double *, int const *);

extern double dlange_(char const *, int const *, int const *, double const *,
    int const *, double *);

extern void dtrsm_(char const *, char const *, char const *, char const *,
    int const *, int const *, double const *, double const *, int const *,
    double *, int const *);

extern void dgemm_(char const *, char const *, int const *, int const *,
    int const *, double const *, double const *, int const *, double const *,
    int const *, double const *, double *, int const *);

double one = 1.0;
double minus_one = -1.0;

// returns the ceil of a / b
int DIVCEIL(int a, int b)
{
    return (a+b-1)/b;
}

// returns the minimum of a and b
int MIN(int a, int b)
{
    return a < b ? a : b;
}

// returns the maxinum of a and b
int MAX(int a, int b)
{
    return a > b ? a : b;
}

void simple_lu(int n, int ldA, double *A)
{
    for (int i = 0; i < n; i++) {
        for (int j = i+1; j < n; j++) {
            A[i*ldA+j] /= A[i*ldA+i];

            for (int k = i+1; k < n; k++)
                A[k*ldA+j] -= A[i*ldA+j] * A[k*ldA+i];
        }
    }
}

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

void blocked_lu(int block_size, int n, int ldA, double *A)
{
    // allocate and fill an array that stores the block pointers
    int block_count = DIVCEIL(n, block_size);
    double ***blocks = (double ***) malloc(block_count*sizeof(double**));
    for (int i = 0; i < block_count; i++) {
        blocks[i] = (double **) malloc(block_count*sizeof(double*));

        for (int j = 0; j < block_count; j++)
            blocks[i][j] = A+(j*ldA+i)*block_size;
    }

    // iterate through the diagonal blocks
    for (int i = 0; i < block_count; i++) {

        // calculate diagonal block size
        int dsize = MIN(block_size, n-i*block_size);

        // calculate trailing matrix size
        int tsize = n-(i+1)*block_size;

        // compute the LU decomposition of the diagonal block
        simple_lu(dsize, ldA, blocks[i][i]);

        if (0 < tsize) {

            // blocks[i][i+1:] <- L1(blocks[i][i]) \ blocks[i][i+1:]
            dtrsm_("Left", "Lower", "No transpose", "Unit triangular",
                &dsize, &tsize, &one, blocks[i][i], &ldA, blocks[i][i+1], &ldA);

            // blocks[i+1:][i] <- U(blocks[i][i]) / blocks[i+1:][i]
            dtrsm_("Right", "Upper", "No Transpose", "Not unit triangular",
                &tsize, &dsize, &one, blocks[i][i], &ldA, blocks[i+1][i], &ldA);

            // blocks[i+1:][i+1:] <- blocks[i+1:][i+1:] -
            //                          blocks[i+1:][i] * blocks[i][i+1:]
            dgemm_("No Transpose", "No Transpose",
                &tsize, &tsize, &dsize, &minus_one, blocks[i+1][i],
                &ldA, blocks[i][i+1], &ldA, &one, blocks[i+1][i+1], &ldA);
        }
    }

    // free allocated resources
    for (int i = 0; i < block_count; i++)
        free(blocks[i]);
    free(blocks);
}

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

// computes C <- L * U
void mul_lu(int n, int lda, int ldb, double const *A, double *B)
{
    // B <- U(A) = U
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < i+1; j++)
            B[i*ldb+j] = A[i*lda+j];
        for (int j = i+1; j < n; j++)
            B[i*ldb+j] = 0.0;
    }

    // B <- L1(A) * B = L * U
    dtrmm_("Left", "Lower", "No Transpose", "Unit triangular",
        &n, &n, &one, A, &lda, B, &ldb);
}

int main(int argc, char **argv)
{
    //
    // check arguments
    //

    if (argc != 3) {
        fprintf(stderr,
            "[error] Incorrect arguments. Use %s (n) (block size)\n", argv[0]);
        return EXIT_FAILURE;
    }

    int n = atoi(argv[1]);
    if (n < 1)  {
        fprintf(stderr, "[error] Invalid matrix dimension.\n");
        return EXIT_FAILURE;
    }

    int block_size = atoi(argv[2]);
    if (block_size < 2)  {
        fprintf(stderr, "[error] Invalid block size.\n");
        return EXIT_FAILURE;
    }

    //
    // Initialize matrix A and store a duplicate to matrix B. Matrix C is for
    // validation.
    //

    srand(time(NULL));

    int ldA, ldB, ldC;
    ldA = ldB = ldC = DIVCEIL(n, 8)*8; // align to 64 bytes
    double *A = (double *) aligned_alloc(8, n*ldA*sizeof(double));
    double *B = (double *) aligned_alloc(8, n*ldB*sizeof(double));
    double *C = (double *) aligned_alloc(8, n*ldC*sizeof(double));

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "[error] Failed to allocate memory.\n");
        return EXIT_FAILURE;
    }

    // A <- random diagonally dominant matrix
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++)
            A[i*ldA+j] = B[i*ldB+j] = 2.0*rand()/RAND_MAX - 1.0;
        A[i*ldA+i] = B[i*ldB+i] = 1.0*rand()/RAND_MAX + n;
    }

    //
    // compute
    //

    struct timespec ts_start;
    clock_gettime(CLOCK_MONOTONIC, &ts_start);

    // A <- (L,U)
    blocked_lu(block_size, n, ldA, A);

    struct timespec ts_stop;
    clock_gettime(CLOCK_MONOTONIC, &ts_stop);

    printf("Time = %f s\n",
        ts_stop.tv_sec - ts_start.tv_sec +
        1.0E-9*(ts_stop.tv_nsec - ts_start.tv_nsec));

    // C <- L * U
    mul_lu(n, ldA, ldC, A, C);

    //
    // validate
    //

    // C <- L * U - B
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            C[i*ldC+j] -= B[i*ldB+j];

    // compute || C ||_F / || B ||_F = || L * U - B ||_F  / || B ||_F
    double residual = dlange_("Frobenius", &n, &n, C, &ldC, NULL) /
        dlange_("Frobenius", &n, &n, B, &ldB, NULL);

    printf("Residual = %E\n", residual);

    int ret = EXIT_SUCCESS;
    if (1.0E-12 < residual) {
        fprintf(stderr, "The residual is too large.\n");
        ret = EXIT_FAILURE;
    }

    //
    // cleanup
    //

    free(A);
    free(B);
    free(C);

    return ret;
}

We can compile and test the algorithm:

$ gcc -o coarse-blocked coarse-blocked.c -Wall ${LIBLAPACK} ${LIBBLAS}
$ OMP_NUM_THREADS=1 ./coarse-blocked 3000 128
Time = 0.480420 s
Residual = 3.917427E-16

The second argument is the block size. We can see that the blocked variant is significantly faster even before it is parallelized.

Challenge

Challenge

Try to parallelize the block row on column updates (dtrsm_). Try both

  • the sections and section constructs and

  • the task construct.

Do you notice any difference in the run time?

Finely-blocked algorithm

If we want to reach a reasonable level of parallelism, we must make the task granularity finer:

../_images/blocked_lu2.png

Finely-blocked implementation

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
void blocked_lu(int block_size, int n, int ldA, double *A)
{
    // allocate and fill an array that stores the block pointers
    int block_count = DIVCEIL(n, block_size);
    double ***blocks = (double ***) malloc(block_count*sizeof(double**));
    for (int i = 0; i < block_count; i++) {
        blocks[i] = (double **) malloc(block_count*sizeof(double*));

        for (int j = 0; j < block_count; j++)
            blocks[i][j] = A+(j*ldA+i)*block_size;
    }

    // iterate through the diagonal blocks
    for (int i = 0; i < block_count; i++) {

        // calculate diagonal block size
        int dsize = MIN(block_size, n-i*block_size);

        // process the current diagonal block
        //
        // +--+--+--+--+
        // |  |  |  |  |
        // +--+--+--+--+   ## - process (read-write)
        // |  |##|  |  |
        // +--+--+--+--+
        // |  |  |  |  |
        // +--+--+--+--+
        // |  |  |  |  |
        // +--+--+--+--+
        //
        simple_lu(dsize, ldA, blocks[i][i]);

        // process the blocks to the right of the current diagonal block
        for (int j = i+1; j < block_count; j++) {
            int width = MIN(block_size, n-j*block_size);

            // blocks[i][j] <- L1(blocks[i][i]) \ blocks[i][j]
            //
            // +--+--+--+--+
            // |  |  |  |  |
            // +--+--+--+--+   ## - process (read-write), current iteration
            // |  |rr|##|xx|   xx - process (read-write)
            // +--+--+--+--+   rr - read
            // |  |  |  |  |
            // +--+--+--+--+
            // |  |  |  |  |
            // +--+--+--+--+
            //
            dtrsm_("Left", "Lower", "No transpose", "Unit triangular",
                &dsize, &width, &one, blocks[i][i], &ldA, blocks[i][j], &ldA);
        }

        // process the blocks below the current diagonal block
        for (int j = i+1; j < block_count; j++) {
            int height = MIN(block_size, n-j*block_size);

            // blocks[j][i] <- U(blocks[i][i]) / blocks[j][i]
            //
            // +--+--+--+--+
            // |  |  |  |  |
            // +--+--+--+--+   ## - process (read-write), current iteration
            // |  |rr|  |  |   xx - process (read-write)
            // +--+--+--+--+   rr - read
            // |  |##|  |  |
            // +--+--+--+--+
            // |  |xx|  |  |
            // +--+--+--+--+
            //
            dtrsm_("Right", "Upper", "No Transpose", "Not unit triangular",
                &height, &dsize, &one, blocks[i][i], &ldA, blocks[j][i], &ldA);
        }

        // process the trailing matrix

        for (int ii = i+1; ii < block_count; ii++) {
            for (int jj = i+1; jj < block_count; jj++) {
                int width = MIN(block_size, n-jj*block_size);
                int height = MIN(block_size, n-ii*block_size);

                // blocks[ii][jj] <-
                //               blocks[ii][jj] - blocks[ii][i] * blocks[i][jj]
                //
                // +--+--+--+--+
                // |  |  |  |  |
                // +--+--+--+--+   ## - process (read-write), current iteration
                // |  |  |rr|..|   xx - process (read-write)
                // +--+--+--+--+   rr - read, current iteration
                // |  |rr|##|xx|   .. - read
                // +--+--+--+--+
                // |  |..|xx|xx|
                // +--+--+--+--+
                //
                dgemm_("No Transpose", "No Transpose",
                    &height, &width, &dsize, &minus_one, blocks[ii][i],
                    &ldA, blocks[i][jj], &ldA, &one, blocks[ii][jj], &ldA);
            }
        }

    }

    // free allocated resources
    for (int i = 0; i < block_count; i++)
        free(blocks[i]);
    free(blocks);
}

In particular, the above code divides the trailing matrix update into numerous sub-tasks.

Test program

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

extern double dnrm2_(int const *, double const *, int const *);

extern void dtrmm_(char const *, char const *, char const *, char const *,
    int const *, int const *, double const *, double const *, int const *,
    double *, int const *);

extern void dlacpy_(char const *, int const *, int const *, double const *,
    int const *, double *, int const *);

extern double dlange_(char const *, int const *, int const *, double const *,
    int const *, double *);

extern void dtrsm_(char const *, char const *, char const *, char const *,
    int const *, int const *, double const *, double const *, int const *,
    double *, int const *);

extern void dgemm_(char const *, char const *, int const *, int const *,
    int const *, double const *, double const *, int const *, double const *,
    int const *, double const *, double *, int const *);

double one = 1.0;
double minus_one = -1.0;

// returns the ceil of a / b
int DIVCEIL(int a, int b)
{
    return (a+b-1)/b;
}

// returns the minimum of a and b
int MIN(int a, int b)
{
    return a < b ? a : b;
}

// returns the maxinum of a and b
int MAX(int a, int b)
{
    return a > b ? a : b;
}

void simple_lu(int n, int ldA, double *A)
{
    for (int i = 0; i < n; i++) {
        for (int j = i+1; j < n; j++) {
            A[i*ldA+j] /= A[i*ldA+i];

            for (int k = i+1; k < n; k++)
                A[k*ldA+j] -= A[i*ldA+j] * A[k*ldA+i];
        }
    }
}

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

void blocked_lu(int block_size, int n, int ldA, double *A)
{
    // allocate and fill an array that stores the block pointers
    int block_count = DIVCEIL(n, block_size);
    double ***blocks = (double ***) malloc(block_count*sizeof(double**));
    for (int i = 0; i < block_count; i++) {
        blocks[i] = (double **) malloc(block_count*sizeof(double*));

        for (int j = 0; j < block_count; j++)
            blocks[i][j] = A+(j*ldA+i)*block_size;
    }

    // iterate through the diagonal blocks
    for (int i = 0; i < block_count; i++) {

        // calculate diagonal block size
        int dsize = MIN(block_size, n-i*block_size);

        // process the current diagonal block
        //
        // +--+--+--+--+
        // |  |  |  |  |
        // +--+--+--+--+   ## - process (read-write)
        // |  |##|  |  |
        // +--+--+--+--+
        // |  |  |  |  |
        // +--+--+--+--+
        // |  |  |  |  |
        // +--+--+--+--+
        //
        simple_lu(dsize, ldA, blocks[i][i]);

        // process the blocks to the right of the current diagonal block
        for (int j = i+1; j < block_count; j++) {
            int width = MIN(block_size, n-j*block_size);

            // blocks[i][j] <- L1(blocks[i][i]) \ blocks[i][j]
            //
            // +--+--+--+--+
            // |  |  |  |  |
            // +--+--+--+--+   ## - process (read-write), current iteration
            // |  |rr|##|xx|   xx - process (read-write)
            // +--+--+--+--+   rr - read
            // |  |  |  |  |
            // +--+--+--+--+
            // |  |  |  |  |
            // +--+--+--+--+
            //
            dtrsm_("Left", "Lower", "No transpose", "Unit triangular",
                &dsize, &width, &one, blocks[i][i], &ldA, blocks[i][j], &ldA);
        }

        // process the blocks below the current diagonal block
        for (int j = i+1; j < block_count; j++) {
            int height = MIN(block_size, n-j*block_size);

            // blocks[j][i] <- U(blocks[i][i]) / blocks[j][i]
            //
            // +--+--+--+--+
            // |  |  |  |  |
            // +--+--+--+--+   ## - process (read-write), current iteration
            // |  |rr|  |  |   xx - process (read-write)
            // +--+--+--+--+   rr - read
            // |  |##|  |  |
            // +--+--+--+--+
            // |  |xx|  |  |
            // +--+--+--+--+
            //
            dtrsm_("Right", "Upper", "No Transpose", "Not unit triangular",
                &height, &dsize, &one, blocks[i][i], &ldA, blocks[j][i], &ldA);
        }

        // process the trailing matrix

        for (int ii = i+1; ii < block_count; ii++) {
            for (int jj = i+1; jj < block_count; jj++) {
                int width = MIN(block_size, n-jj*block_size);
                int height = MIN(block_size, n-ii*block_size);

                // blocks[ii][jj] <-
                //               blocks[ii][jj] - blocks[ii][i] * blocks[i][jj]
                //
                // +--+--+--+--+
                // |  |  |  |  |
                // +--+--+--+--+   ## - process (read-write), current iteration
                // |  |  |rr|..|   xx - process (read-write)
                // +--+--+--+--+   rr - read, current iteration
                // |  |rr|##|xx|   .. - read
                // +--+--+--+--+
                // |  |..|xx|xx|
                // +--+--+--+--+
                //
                dgemm_("No Transpose", "No Transpose",
                    &height, &width, &dsize, &minus_one, blocks[ii][i],
                    &ldA, blocks[i][jj], &ldA, &one, blocks[ii][jj], &ldA);
            }
        }

    }

    // free allocated resources
    for (int i = 0; i < block_count; i++)
        free(blocks[i]);
    free(blocks);
}

////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////

// computes C <- L * U
void mul_lu(int n, int lda, int ldb, double const *A, double *B)
{
    // B <- U(A) = U
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < i+1; j++)
            B[i*ldb+j] = A[i*lda+j];
        for (int j = i+1; j < n; j++)
            B[i*ldb+j] = 0.0;
    }

    // B <- L1(A) * B = L * U
    dtrmm_("Left", "Lower", "No Transpose", "Unit triangular",
        &n, &n, &one, A, &lda, B, &ldb);
}

int main(int argc, char **argv)
{
    //
    // check arguments
    //

    if (argc != 3) {
        fprintf(stderr,
            "[error] Incorrect arguments. Use %s (n) (block size)\n", argv[0]);
        return EXIT_FAILURE;
    }

    int n = atoi(argv[1]);
    if (n < 1)  {
        fprintf(stderr, "[error] Invalid matrix dimension.\n");
        return EXIT_FAILURE;
    }

    int block_size = atoi(argv[2]);
    if (block_size < 2)  {
        fprintf(stderr, "[error] Invalid block size.\n");
        return EXIT_FAILURE;
    }

    //
    // Initialize matrix A and store a duplicate to matrix B. Matrix C is for
    // validation.
    //

    srand(time(NULL));

    int ldA, ldB, ldC;
    ldA = ldB = ldC = DIVCEIL(n, 8)*8; // align to 64 bytes
    double *A = (double *) aligned_alloc(8, n*ldA*sizeof(double));
    double *B = (double *) aligned_alloc(8, n*ldB*sizeof(double));
    double *C = (double *) aligned_alloc(8, n*ldC*sizeof(double));

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "[error] Failed to allocate memory.\n");
        return EXIT_FAILURE;
    }

    // A <- random diagonally dominant matrix
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++)
            A[i*ldA+j] = B[i*ldB+j] = 2.0*rand()/RAND_MAX - 1.0;
        A[i*ldA+i] = B[i*ldB+i] = 1.0*rand()/RAND_MAX + n;
    }

    //
    // compute
    //

    struct timespec ts_start;
    clock_gettime(CLOCK_MONOTONIC, &ts_start);

    // A <- (L,U)
    blocked_lu(block_size, n, ldA, A);

    struct timespec ts_stop;
    clock_gettime(CLOCK_MONOTONIC, &ts_stop);

    printf("Time = %f s\n",
        ts_stop.tv_sec - ts_start.tv_sec +
        1.0E-9*(ts_stop.tv_nsec - ts_start.tv_nsec));

    // C <- L * U
    mul_lu(n, ldA, ldC, A, C);

    //
    // validate
    //

    // C <- L * U - B
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            C[i*ldC+j] -= B[i*ldB+j];

    // compute || C ||_F / || B ||_F = || L * U - B ||_F  / || B ||_F
    double residual = dlange_("Frobenius", &n, &n, C, &ldC, NULL) /
        dlange_("Frobenius", &n, &n, B, &ldB, NULL);

    printf("Residual = %E\n", residual);

    int ret = EXIT_SUCCESS;
    if (1.0E-12 < residual) {
        fprintf(stderr, "The residual is too large.\n");
        ret = EXIT_FAILURE;
    }

    //
    // cleanup
    //

    free(A);
    free(B);
    free(C);

    return ret;
}
$ gcc -o finely-blocked finely-blocked.c -Wall ${LIBLAPACK} ${LIBBLAS}
$ OMP_NUM_THREADS=1 ./finely-blocked 3000 128
Time = 0.587407 s
Residual = 3.958161E-16

Challenge

Challenge

Parallelize the finely-blocked algorithm using task constructs and depend clauses.

Priorities

We can prioritize the critical path by using the following offsets:

../_images/priorities.png

Above, the priority is given by omp_get_max_task_priority() - offset.

Challenge

Parallelize the finely-blocked algorithm using task constructs and depend and priority clauses.