Skip to content

Commit 9695b3b

Browse files
committed
support_large_arrays
1 parent 01224e7 commit 9695b3b

37 files changed

+184
-162
lines changed

src/StreamModels.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
#endif
3737

3838
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(int array_size, int deviceIndex) {
39+
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
4040
#if defined(CUDA)
4141
// Use the CUDA implementation
4242
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);

src/acc/ACCStream.cpp

+24-18
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,12 @@
88
#include "ACCStream.h"
99

1010
template <class T>
11-
ACCStream<T>::ACCStream(const int ARRAY_SIZE, int device)
11+
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
12+
: array_size{ARRAY_SIZE}
1213
{
1314
acc_device_t device_type = acc_get_device_type();
1415
acc_set_device_num(device, device_type);
1516

16-
array_size = ARRAY_SIZE;
17-
1817
// Set up data region on device
1918
this->a = new T[array_size];
2019
this->b = new T[array_size];
@@ -32,7 +31,7 @@ template <class T>
3231
ACCStream<T>::~ACCStream()
3332
{
3433
// End data region on device
35-
int array_size = this->array_size;
34+
intptr_t array_size = this->array_size;
3635

3736
T * restrict a = this->a;
3837
T * restrict b = this->b;
@@ -49,12 +48,12 @@ ACCStream<T>::~ACCStream()
4948
template <class T>
5049
void ACCStream<T>::init_arrays(T initA, T initB, T initC)
5150
{
52-
int array_size = this->array_size;
51+
intptr_t array_size = this->array_size;
5352
T * restrict a = this->a;
5453
T * restrict b = this->b;
5554
T * restrict c = this->c;
5655
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
57-
for (int i = 0; i < array_size; i++)
56+
for (intptr_t i = 0; i < array_size; i++)
5857
{
5958
a[i] = initA;
6059
b[i] = initB;
@@ -70,16 +69,23 @@ void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
7069
T *c = this->c;
7170
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
7271
{}
72+
73+
for (intptr_t i = 0; i < array_size; i++)
74+
{
75+
h_a[i] = a[i];
76+
h_b[i] = b[i];
77+
h_c[i] = c[i];
78+
}
7379
}
7480

7581
template <class T>
7682
void ACCStream<T>::copy()
7783
{
78-
int array_size = this->array_size;
84+
intptr_t array_size = this->array_size;
7985
T * restrict a = this->a;
8086
T * restrict c = this->c;
8187
#pragma acc parallel loop present(a[0:array_size], c[0:array_size]) wait
82-
for (int i = 0; i < array_size; i++)
88+
for (intptr_t i = 0; i < array_size; i++)
8389
{
8490
c[i] = a[i];
8591
}
@@ -90,11 +96,11 @@ void ACCStream<T>::mul()
9096
{
9197
const T scalar = startScalar;
9298

93-
int array_size = this->array_size;
99+
intptr_t array_size = this->array_size;
94100
T * restrict b = this->b;
95101
T * restrict c = this->c;
96102
#pragma acc parallel loop present(b[0:array_size], c[0:array_size]) wait
97-
for (int i = 0; i < array_size; i++)
103+
for (intptr_t i = 0; i < array_size; i++)
98104
{
99105
b[i] = scalar * c[i];
100106
}
@@ -103,12 +109,12 @@ void ACCStream<T>::mul()
103109
template <class T>
104110
void ACCStream<T>::add()
105111
{
106-
int array_size = this->array_size;
112+
intptr_t array_size = this->array_size;
107113
T * restrict a = this->a;
108114
T * restrict b = this->b;
109115
T * restrict c = this->c;
110116
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
111-
for (int i = 0; i < array_size; i++)
117+
for (intptr_t i = 0; i < array_size; i++)
112118
{
113119
c[i] = a[i] + b[i];
114120
}
@@ -119,12 +125,12 @@ void ACCStream<T>::triad()
119125
{
120126
const T scalar = startScalar;
121127

122-
int array_size = this->array_size;
128+
intptr_t array_size = this->array_size;
123129
T * restrict a = this->a;
124130
T * restrict b = this->b;
125131
T * restrict c = this->c;
126132
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
127-
for (int i = 0; i < array_size; i++)
133+
for (intptr_t i = 0; i < array_size; i++)
128134
{
129135
a[i] = b[i] + scalar * c[i];
130136
}
@@ -135,12 +141,12 @@ void ACCStream<T>::nstream()
135141
{
136142
const T scalar = startScalar;
137143

138-
int array_size = this->array_size;
144+
intptr_t array_size = this->array_size;
139145
T * restrict a = this->a;
140146
T * restrict b = this->b;
141147
T * restrict c = this->c;
142148
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
143-
for (int i = 0; i < array_size; i++)
149+
for (intptr_t i = 0; i < array_size; i++)
144150
{
145151
a[i] += b[i] + scalar * c[i];
146152
}
@@ -151,11 +157,11 @@ T ACCStream<T>::dot()
151157
{
152158
T sum{};
153159

154-
int array_size = this->array_size;
160+
intptr_t array_size = this->array_size;
155161
T * restrict a = this->a;
156162
T * restrict b = this->b;
157163
#pragma acc parallel loop reduction(+:sum) present(a[0:array_size], b[0:array_size]) wait
158-
for (int i = 0; i < array_size; i++)
164+
for (intptr_t i = 0; i < array_size; i++)
159165
{
160166
sum += a[i] * b[i];
161167
}

src/acc/ACCStream.h

+7-11
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,23 @@
1919
template <class T>
2020
class ACCStream : public Stream<T>
2121
{
22-
23-
struct A{
24-
T *a;
25-
T *b;
26-
T *c;
27-
};
22+
struct A{
23+
T *a;
24+
T *b;
25+
T *c;
26+
};
2827

2928
protected:
3029
// Size of arrays
31-
int array_size;
30+
intptr_t array_size;
3231
A aa;
3332
// Device side pointers
3433
T *a;
3534
T *b;
3635
T *c;
3736

3837
public:
39-
ACCStream(const int, int);
38+
ACCStream(const intptr_t, int);
4039
~ACCStream();
4140

4241
virtual void copy() override;
@@ -48,7 +47,4 @@ class ACCStream : public Stream<T>
4847

4948
virtual void init_arrays(T initA, T initB, T initC) override;
5049
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
51-
52-
53-
5450
};

src/cuda/CUDAStream.cu

+17-17
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a +
2020
cudaStream_t stream;
2121

2222
template <class T>
23-
CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
23+
CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
2424
: array_size(array_size)
2525
{
2626
// Set device
@@ -96,9 +96,9 @@ CUDAStream<T>::~CUDAStream()
9696
}
9797

9898
template <typename T>
99-
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
99+
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size)
100100
{
101-
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
101+
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
102102
a[i] = initA;
103103
b[i] = initB;
104104
c[i] = initC;
@@ -120,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
120120
// Copy device memory to host
121121
#if defined(PAGEFAULT) || defined(MANAGED)
122122
CU(cudaStreamSynchronize(stream));
123-
for (int i = 0; i < array_size; ++i)
123+
for (intptr_t i = 0; i < array_size; ++i)
124124
{
125125
a[i] = d_a[i];
126126
b[i] = d_b[i];
@@ -134,9 +134,9 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
134134
}
135135

136136
template <typename T>
137-
__global__ void copy_kernel(const T * a, T * c, int array_size)
137+
__global__ void copy_kernel(const T * a, T * c, size_t array_size)
138138
{
139-
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
139+
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
140140
c[i] = a[i];
141141
}
142142
}
@@ -151,10 +151,10 @@ void CUDAStream<T>::copy()
151151
}
152152

153153
template <typename T>
154-
__global__ void mul_kernel(T * b, const T * c, int array_size)
154+
__global__ void mul_kernel(T * b, const T * c, size_t array_size)
155155
{
156156
const T scalar = startScalar;
157-
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
157+
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
158158
b[i] = scalar * c[i];
159159
}
160160
}
@@ -169,9 +169,9 @@ void CUDAStream<T>::mul()
169169
}
170170

171171
template <typename T>
172-
__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
172+
__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size)
173173
{
174-
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
174+
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
175175
c[i] = a[i] + b[i];
176176
}
177177
}
@@ -186,10 +186,10 @@ void CUDAStream<T>::add()
186186
}
187187

188188
template <typename T>
189-
__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
189+
__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size)
190190
{
191191
const T scalar = startScalar;
192-
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
192+
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
193193
a[i] = b[i] + scalar * c[i];
194194
}
195195
}
@@ -204,10 +204,10 @@ void CUDAStream<T>::triad()
204204
}
205205

206206
template <typename T>
207-
__global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size)
207+
__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size)
208208
{
209209
const T scalar = startScalar;
210-
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
210+
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
211211
a[i] += b[i] + scalar * c[i];
212212
}
213213
}
@@ -222,12 +222,12 @@ void CUDAStream<T>::nstream()
222222
}
223223

224224
template <class T>
225-
__global__ void dot_kernel(const T * a, const T * b, T* sums, int array_size)
225+
__global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size)
226226
{
227227
__shared__ T smem[TBSIZE];
228228
T tmp = T(0.);
229229
const size_t tidx = threadIdx.x;
230-
for (int i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
230+
for (size_t i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
231231
tmp += a[i] * b[i];
232232
}
233233
smem[tidx] = tmp;
@@ -249,7 +249,7 @@ T CUDAStream<T>::dot()
249249
CU(cudaStreamSynchronize(stream));
250250

251251
T sum = 0.0;
252-
for (int i = 0; i < dot_num_blocks; ++i) sum += sums[i];
252+
for (intptr_t i = 0; i < dot_num_blocks; ++i) sum += sums[i];
253253

254254
return sum;
255255
}

src/cuda/CUDAStream.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class CUDAStream : public Stream<T>
2222
{
2323
protected:
2424
// Size of arrays
25-
int array_size;
25+
intptr_t array_size;
2626

2727
// Host array for partial sums for dot kernel
2828
T *sums;
@@ -33,10 +33,10 @@ class CUDAStream : public Stream<T>
3333
T *d_c;
3434

3535
// Number of blocks for dot kernel
36-
int dot_num_blocks;
36+
intptr_t dot_num_blocks;
3737

3838
public:
39-
CUDAStream(const int, const int);
39+
CUDAStream(const intptr_t, const int);
4040
~CUDAStream();
4141

4242
virtual void copy() override;

0 commit comments

Comments
 (0)