23
23
* \author Hang Zhang
24
24
*/
25
25
#include " bilinear_resize-inl.h"
26
- // #include "elemwise_op_common.h"
27
26
#include " ../elemwise_op_common.h"
28
27
29
28
namespace mxnet {
@@ -44,56 +43,66 @@ void SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<cpu> *s,
44
43
int inputHeight = itensor.size (2 );
45
44
int inputWidth = itensor.size (3 );
46
45
46
+ const auto nthreads = engine::OpenMP::Get ()->GetRecommendedOMPThreadCount ();
47
+
47
48
DType *idata = itensor.dptr_ ;
48
49
DType *odata = otensor.dptr_ ;
49
50
channels = nbatch * channels;
51
+ const int input_elems_per_channel = inputWidth * inputHeight;
52
+ const int output_elems_per_channel = outputWidth * outputHeight;
53
+
50
54
// special case: just copy
51
55
if (inputHeight == outputHeight && inputWidth == outputWidth) {
52
- for (int h2 = 0 ; h2 < outputHeight; ++h2) {
56
+ #pragma omp parallel for num_threads(nthreads)
57
+ for (int index = 0 ; index < output_elems_per_channel; index++) {
58
+ const int h2 = index / outputWidth;
53
59
const int h1 = h2;
54
- for (int w2 = 0 ; w2 < outputWidth; ++w2) {
55
- const int w1 = w2;
56
- const DType* pos1 = &idata[h1 * inputWidth + w1];
57
- DType* pos2 = &odata[h2 * outputWidth + w2];
58
- for (int c = 0 ; c < channels; ++c) {
59
- pos2[0 ] = pos1[0 ];
60
- pos1 += inputWidth * inputHeight;
61
- pos2 += outputWidth * outputHeight;
62
- }
60
+ const int w2 = index % outputWidth;
61
+ const int w1 = w2;
62
+ const DType* pos1 = &idata[h1 * inputWidth + w1];
63
+ DType* pos2 = &odata[index];
64
+ for (int c = 0 ; c < channels; ++c) {
65
+ *pos2 = *pos1;
66
+ pos1 += input_elems_per_channel;
67
+ pos2 += output_elems_per_channel;
63
68
}
64
69
}
65
70
return ;
66
71
}
72
+
67
73
const float rheight =(outputHeight > 1 ) ? static_cast <float >(inputHeight - 1 )/
68
74
(outputHeight - 1 ) : 0 .f ;
69
75
const float rwidth = (outputWidth > 1 ) ? static_cast <float >(inputWidth - 1 ) /
70
76
(outputWidth - 1 ) : 0 .f ;
71
- for (int h2 = 0 ; h2 < outputHeight; ++h2) {
77
+ #pragma omp parallel for num_threads(nthreads)
78
+ for (int index = 0 ; index < output_elems_per_channel; index++) {
79
+ const int h2 = index / outputWidth;
80
+ const int w2 = index % outputWidth;
81
+
72
82
const float h1r = rheight * h2;
73
83
const int h1 = h1r;
74
84
const int h1p = (h1 < inputHeight - 1 ) ? 1 : 0 ;
75
85
const DType h1lambda = h1r - h1;
76
86
const DType h0lambda = (DType)1 . - h1lambda;
77
- for ( int w2 = 0 ; w2 < outputWidth; ++w2) {
78
- const float w1r = rwidth * w2;
79
- const int w1 = w1r;
80
- const int w1p = (w1 < inputWidth - 1 ) ? 1 : 0 ;
81
- const DType w1lambda = w1r - w1;
82
- const DType w0lambda = (DType)1 . - w1lambda;
83
- const DType* pos1 = &idata[h1 * inputWidth + w1];
84
- DType* pos2 = &odata[h2 * outputWidth + w2 ];
85
- for ( int c = 0 ; c < channels; ++c) {
86
- pos2[ 0 ] = h0lambda * (w0lambda * pos1[ 0 ]+ w1lambda * pos1[w1p])
87
- + h1lambda * (w0lambda * pos1[h1p * inputWidth]
88
- + w1lambda * pos1[h1p * inputWidth + w1p]);
89
- pos1 += inputWidth * inputHeight ;
90
- pos2 += outputWidth * outputHeight ;
91
- }
87
+
88
+ const float w1r = rwidth * w2;
89
+ const int w1 = w1r;
90
+ const int w1p = (w1 < inputWidth - 1 ) ? 1 : 0 ;
91
+ const DType w1lambda = w1r - w1;
92
+ const DType w0lambda = (DType)1 . - w1lambda;
93
+ const DType* pos1 = &idata[h1 * inputWidth + w1];
94
+ DType* pos2 = &odata[index ];
95
+
96
+ for ( int c = 0 ; c < channels; ++c) {
97
+ *pos2 = h0lambda * (w0lambda * (*pos1) + w1lambda * *( pos1 + w1p))
98
+ + h1lambda * (w0lambda * *(pos1 + h1p * inputWidth)
99
+ + w1lambda * *( pos1 + h1p * inputWidth + w1p)) ;
100
+ pos1 += input_elems_per_channel ;
101
+ pos2 += output_elems_per_channel;
92
102
}
93
103
}
94
104
}
95
105
96
-
97
106
template <typename xpu, typename DType, typename AccReal>
98
107
void SpatialUpSamplingBilinearUpdateGradInput (mshadow::Stream<cpu> *s,
99
108
const std::vector<TBlob> &input,
@@ -109,23 +118,28 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
109
118
int inputHeight = gradInput.size (2 );
110
119
int inputWidth = gradInput.size (3 );
111
120
121
+ const auto nthreads = engine::OpenMP::Get ()->GetRecommendedOMPThreadCount ();
122
+
112
123
DType *dataInput = gradInput.dptr_ ;
113
124
DType *dataOutput = gradOutput.dptr_ ;
114
125
channels = nbatch * channels;
126
+ const int input_elems_per_channel = inputWidth * inputHeight;
127
+ const int output_elems_per_channel = outputWidth * outputHeight;
115
128
116
129
// special case: same-size matching grids
117
130
if (inputHeight == outputHeight && inputWidth == outputWidth) {
118
- for (int h2 = 0 ; h2 < outputHeight; ++h2) {
131
+ #pragma omp parallel for num_threads(nthreads)
132
+ for (int index = 0 ; index < output_elems_per_channel; index++) {
133
+ const int h2 = index / outputWidth;
119
134
const int h1 = h2;
120
- for (int w2 = 0 ; w2 < outputWidth; ++w2) {
121
- const int w1 = w2;
122
- DType* pos1 = &dataInput[h1 * inputWidth + w1];
123
- const DType* pos2 = &dataOutput[h2 * outputWidth + w2];
124
- for (int c = 0 ; c < channels; ++c) {
125
- pos1[0 ] += pos2[0 ];
126
- pos1 += inputWidth * inputHeight;
127
- pos2 += outputWidth * outputHeight;
128
- }
135
+ const int w2 = index % outputWidth;
136
+ const int w1 = w2;
137
+ DType* pos1 = &dataInput[h1 * inputWidth + w1];
138
+ const DType* pos2 = &dataOutput[index];
139
+ for (int c = 0 ; c < channels; ++c) {
140
+ *pos1 += *pos2;
141
+ pos1 += input_elems_per_channel;
142
+ pos2 += output_elems_per_channel;
129
143
}
130
144
}
131
145
return ;
@@ -134,28 +148,36 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
134
148
(outputHeight - 1 ) : 0 .f ;
135
149
const float rwidth = (outputWidth > 1 ) ? static_cast <float >(inputWidth - 1 )/
136
150
(outputWidth - 1 ) : 0 .f ;
137
- for (int h2 = 0 ; h2 < outputHeight; ++h2) {
151
+
152
+ #pragma omp parallel for num_threads(nthreads)
153
+ for (int index = 0 ; index < output_elems_per_channel; index++) {
154
+ const int h2 = index / outputWidth;
155
+ const int w2 = index % outputWidth;
156
+
138
157
const float h1r = rheight * h2;
139
158
const int h1 = h1r;
140
159
const int h1p = (h1 < inputHeight - 1 ) ? 1 : 0 ;
141
160
const DType h1lambda = h1r - h1;
142
161
const DType h0lambda = (DType)1 . - h1lambda;
143
- for (int w2 = 0 ; w2 < outputWidth; ++w2) {
144
- const float w1r = rwidth * w2;
145
- const int w1 = w1r;
146
- const int w1p = (w1 < inputWidth - 1 ) ? 1 : 0 ;
147
- const DType w1lambda = w1r - w1;
148
- const DType w0lambda = (DType)1 . - w1lambda;
149
- DType* posInput = &dataInput[h1 * inputWidth + w1];
150
- const DType* posOutput = &dataOutput[h2 * outputWidth + w2];
151
- for (int c = 0 ; c < channels; ++c) {
152
- posInput[0 ] += h0lambda * w0lambda * posOutput[0 ];
153
- posInput[w1p] += h0lambda * w1lambda * posOutput[0 ];
154
- posInput[h1p * inputWidth] += h1lambda * w0lambda * posOutput[0 ];
155
- posInput[h1p * inputWidth + w1p] += h1lambda * w1lambda * posOutput[0 ];
156
- posInput += inputWidth * inputHeight;
157
- posOutput += outputWidth * outputHeight;
162
+
163
+ const float w1r = rwidth * w2;
164
+ const int w1 = w1r;
165
+ const int w1p = (w1 < inputWidth - 1 ) ? 1 : 0 ;
166
+ const DType w1lambda = w1r - w1;
167
+ const DType w0lambda = (DType)1 . - w1lambda;
168
+
169
+ DType* posInput = &dataInput[h1 * inputWidth + w1];
170
+ const DType* posOutput = &dataOutput[index];
171
+ for (int c = 0 ; c < channels; ++c) {
172
+ #pragma omp critical
173
+ {
174
+ *posInput += h0lambda * w0lambda * (*posOutput);
175
+ *(posInput + w1p) += h0lambda * w1lambda * (*posOutput);
176
+ *(posInput + h1p * inputWidth) += h1lambda * w0lambda * (*posOutput);
177
+ *(posInput + h1p * inputWidth + w1p) += h1lambda * w1lambda * (*posOutput);
158
178
}
179
+ posInput += input_elems_per_channel;
180
+ posOutput += output_elems_per_channel;
159
181
}
160
182
}
161
183
@@ -165,19 +187,19 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
165
187
int inputWidthLike = gradInputLike.size (3 );
166
188
DType *dataInputLike = gradInputLike.dptr_ ;
167
189
int channelsLike = nbatch * gradInputLike.size (1 );
168
- for (int h_like = 0 ; h_like < inputHeightLike; ++h_like) {
169
- for (int w_like = 0 ; w_like < inputWidthLike; ++w_like) {
170
- DType *posInput = &dataInputLike[h_like * inputWidthLike + w_like];
171
- for (int c = 0 ; c < channelsLike; ++c) {
172
- posInput[0 ] = 0 ;
173
- posInput += inputWidthLike * inputHeightLike;
174
- }
190
+
191
+ const int inputLike_elems_per_channel = inputHeightLike * inputWidthLike;
192
+ #pragma omp parallel for num_threads(nthreads)
193
+ for (int index = 0 ; index < inputLike_elems_per_channel; index++) {
194
+ DType *posInput = &dataInputLike[index];
195
+ for (int c = 0 ; c < channelsLike; ++c) {
196
+ *posInput = 0 ;
197
+ posInput += inputLike_elems_per_channel;
175
198
}
176
199
}
177
200
}
178
201
}
179
202
180
-
181
203
DMLC_REGISTER_PARAMETER (BilinearSampleParam);
182
204
183
205
NNVM_REGISTER_OP (_contrib_BilinearResize2D)
0 commit comments