-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmat_add.cu
More file actions
129 lines (106 loc) · 3.34 KB
/
mat_add.cu
File metadata and controls
129 lines (106 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* File: mat_add.cu
* Purpose: Implement matrix addition on a gpu using cuda
*
 * Compile: nvcc [-g] [-G] -arch=<sm_XX for your GPU> -o mat_add mat_add.cu
 *          (the original -arch=sm_21 target is no longer accepted by
 *          CUDA toolkits 9.0 and later)
* Run: ./mat_add <m> <n>
* m is the number of rows
* n is the number of columns
*
* Input: The matrices A and B
* Output: Result of matrix addition.
*
* Notes:
* 1. There are m blocks with n threads each.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*---------------------------------------------------------------------
 * Kernel:  Mat_add
 * Purpose: Compute C = A + B for m x n matrices stored row-major in
 *          device global memory.
 * In args: A, B, m, n
 * Out arg: C
 * Launch:  intended as <<<m, n>>> -- one block per row, one thread
 *          per column.  Requires n <= max threads per block.
 */
__global__ void Mat_add(float A[], float B[], float C[], int m, int n) {
   int row = blockIdx.x;    /* one block per row     */
   int col = threadIdx.x;   /* one thread per column */

   /* Index from (row, col) and n directly.  The original used
    * blockDim.x * blockIdx.x + threadIdx.x, which addresses the wrong
    * element whenever blockDim.x != n; this form is identical under
    * the documented <<<m, n>>> launch and correct for any launch with
    * extra blocks/threads, which the guard below discards. */
   if (row < m && col < n)
      C[row*n + col] = A[row*n + col] + B[row*n + col];
}  /* Mat_add */
/*---------------------------------------------------------------------
 * Function: Read_matrix
 * Purpose:  Read an m x n matrix from stdin into A (row-major)
 * In args:  m, n
 * Out arg:  A
 *
 * Exits with a failure status if stdin runs out of data or contains
 * a token that isn't a float: the original ignored scanf's return
 * value, silently leaving uninitialized garbage in A on bad input.
 */
void Read_matrix(float A[], int m, int n) {
   int i, j;
   for (i = 0; i < m; i++)
      for (j = 0; j < n; j++)
         if (scanf("%f", &A[i*n+j]) != 1) {
            fprintf(stderr,
                  "Read_matrix: bad or missing input at row %d, col %d\n",
                  i, j);
            exit(EXIT_FAILURE);
         }
}  /* Read_matrix */
/*---------------------------------------------------------------------
 * Function: Print_matrix
 * Purpose:  Write a title line followed by the m x n matrix A
 *           (row-major, one row per output line, "%.1f " per entry)
 *           to stdout
 * In args:  title, A, m, n
 */
void Print_matrix(char title[], float A[], int m, int n) {
   int row, col;

   printf("%s\n", title);
   for (row = 0; row < m; row++) {
      float* row_start = A + row*n;   /* start of current row */
      for (col = 0; col < n; col++)
         printf("%.1f ", row_start[col]);
      printf("\n");
   }
}  /* Print_matrix */
/* Host code */

/* Abort with a descriptive message if a CUDA runtime call failed.
 * The original checked no CUDA return codes, so device errors were
 * silently ignored and garbage printed as the "sum". */
static void Check_cuda(cudaError_t err, const char msg[]) {
   if (err != cudaSuccess) {
      fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
   }
}

/* Read two m x n matrices from stdin, add them on the GPU with one
 * block per row and one thread per column, and print the result. */
int main(int argc, char* argv[]) {
   int m, n;
   float *h_A, *h_B, *h_C;
   float *d_A, *d_B, *d_C;
   size_t size;

   /* Get size of matrices */
   if (argc != 3) {
      fprintf(stderr, "usage: %s <row count> <col count>\n", argv[0]);
      exit(EXIT_FAILURE);   /* a usage error is a failure, not exit(0) */
   }
   m = strtol(argv[1], NULL, 10);
   n = strtol(argv[2], NULL, 10);
   if (m <= 0 || n <= 0) {
      fprintf(stderr, "row and column counts must be positive\n");
      exit(EXIT_FAILURE);
   }
   printf("m = %d, n = %d\n", m, n);
   /* widen before multiplying so large m*n doesn't overflow int */
   size = (size_t) m * n * sizeof(float);

   h_A = (float*) malloc(size);
   h_B = (float*) malloc(size);
   h_C = (float*) malloc(size);
   if (h_A == NULL || h_B == NULL || h_C == NULL) {
      fprintf(stderr, "can't allocate host matrices\n");
      exit(EXIT_FAILURE);
   }

   printf("Enter the matrices A and B\n");
   Read_matrix(h_A, m, n);
   Read_matrix(h_B, m, n);
   Print_matrix("A =", h_A, m, n);
   Print_matrix("B =", h_B, m, n);

   /* Allocate matrices in device memory */
   Check_cuda(cudaMalloc(&d_A, size), "cudaMalloc d_A");
   Check_cuda(cudaMalloc(&d_B, size), "cudaMalloc d_B");
   Check_cuda(cudaMalloc(&d_C, size), "cudaMalloc d_C");

   /* Copy matrices from host memory to device memory */
   Check_cuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
         "copy A to device");
   Check_cuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
         "copy B to device");

   /* Invoke kernel using m thread blocks of n threads each: one block
    * per row, one thread per column (requires n <= max threads/block). */
   Mat_add<<<m, n>>>(d_A, d_B, d_C, m, n);
   /* Launches don't return errors directly; a bad configuration
    * (e.g. n too large) only surfaces via cudaGetLastError. */
   Check_cuda(cudaGetLastError(), "Mat_add launch");

   /* Wait for the kernel to complete and surface any execution error.
    * cudaDeviceSynchronize replaces the long-deprecated
    * cudaThreadSynchronize. */
   Check_cuda(cudaDeviceSynchronize(), "Mat_add execution");

   /* Copy result from device memory to host memory */
   Check_cuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
         "copy C to host");
   Print_matrix("The sum is: ", h_C, m, n);

   /* Free device memory */
   cudaFree(d_A);
   cudaFree(d_B);
   cudaFree(d_C);
   /* Free host memory */
   free(h_A);
   free(h_B);
   free(h_C);
   return 0;
}  /* main */