@@ -25,7 +25,7 @@ void *rofi_get_remote_addr_internal(void *addr, unsigned int id) {
25
25
return NULL ;
26
26
}
27
27
28
- DEBUG_MSG ("\t Found MR [0x%lx - 0x%lx] Addr: %p Key: 0x%lx " , el -> start , el -> start + el -> size , (void * )(addr - (uintptr_t )el -> start + el -> iov [id ].addr ), el -> mr_key );
28
+ DEBUG_MSG ("\t Found MR [0x%lx - 0x%lx] Addr: %p Key: 0x%lx " , el -> start , el -> start + el -> size , (void * )(addr - (uintptr_t )el -> start + el -> iov [id ].addr ), el -> iov [ id ]. key );
29
29
return (void * )(addr - (uintptr_t )el -> start + el -> iov [id ].addr );
30
30
}
31
31
@@ -38,7 +38,7 @@ void *rofi_get_local_addr_from_remote_addr_internal(void *addr, unsigned int id)
38
38
return NULL ;
39
39
}
40
40
41
- DEBUG_MSG ("\t Found MR [0x%lx - 0x%lx] Addr: %p Key: 0x%lx" , el -> start , el -> start + el -> size , (void * )(addr - el -> iov [id ].addr + (uintptr_t )el -> start ), el -> mr_key );
41
+ DEBUG_MSG ("\t Found MR [0x%lx - 0x%lx] Addr: %p Key: 0x%lx" , el -> start , el -> start + el -> size , (void * )(addr - el -> iov [id ].addr + (uintptr_t )el -> start ), el -> iov [ id ]. key );
42
42
return (void * )(addr - el -> iov [id ].addr + (uintptr_t )el -> start );
43
43
}
44
44
@@ -206,10 +206,34 @@ int rofi_send_internal(unsigned long id, void *buf, size_t size, unsigned long f
206
206
/*
 * Receive entry point. No receive logic is implemented here, so the call
 * always reports failure.
 *
 * The previous body was empty: control fell off the end of a function
 * declared to return int, which leaves the result undefined for any caller
 * that inspects it. Returning an explicit non-zero status makes the
 * "not implemented" condition observable.
 *
 * id, buf, size, flags: accepted for interface compatibility; unused.
 *
 * Returns EXIT_FAILURE.
 */
int rofi_recv_internal(unsigned long id, void *buf, size_t size, unsigned long flags) {
    (void)id;
    (void)buf;
    (void)size;
    (void)flags;

    return EXIT_FAILURE;
}
208
208
209
- int rofi_init_internal (char * prov ) {
210
- if (!prov ) {
211
- ERR_MSG ("ROFI provider not specified. Currently ROFI only supports \"verbs\"." );
209
+ rofi_names_t * rofi_parse_names_internal (char * names_list ) {
210
+ char token = ';' ;
211
+ int name_cnt = 0 ;
212
+ for (int i = 0 ; i < strlen (names_list ); i ++ ) {
213
+ if (names_list [i ] == token ) {
214
+ name_cnt ++ ;
215
+ }
216
+ }
217
+ name_cnt += 1 ;
218
+ char * * name_strs = (char * * )calloc (name_cnt , sizeof (char * ));
219
+
220
+ int p = 0 ;
221
+ int i = 0 ;
222
+ for (int k = 0 ; k < strlen (names_list ); k ++ ) {
223
+ if (names_list [k ] == token ) {
224
+ name_strs [p ] = strndup (& names_list [i ], k - i );
225
+ p ++ ;
226
+ i = k + 1 ;
227
+ }
212
228
}
229
+ name_strs [p ] = strndup (& names_list [i ], strlen (names_list ) - i );
230
+ rofi_names_t * names = (rofi_names_t * )calloc (1 , sizeof (rofi_names_t ));
231
+ names -> num = name_cnt ;
232
+ names -> names = name_strs ;
233
+ return names ;
234
+ }
235
+
236
+ int rofi_init_internal (char * provs , char * domains ) {
213
237
pthread_rwlock_init (& rofi .mr_lock , NULL );
214
238
pthread_mutex_init (& rofi .lock , NULL );
215
239
int ret = 0 ;
@@ -224,12 +248,7 @@ int rofi_init_internal(char *prov) {
224
248
rofi .desc .nodes = rt_get_size ();
225
249
rofi .desc .nid = rt_get_rank ();
226
250
227
- rofi .info = (struct fi_info * )calloc (1 , sizeof (struct fi_info ));
228
- if (!rofi .info ) {
229
- ERR_MSG ("Error allocating memory for rofi. Aborting." );
230
- ret = EXIT_FAILURE ;
231
- goto err ;
232
- }
251
+ rofi .info = NULL ;
233
252
234
253
DEBUG_MSG ("Initializing process %d/%d..." , rofi .desc .nid , rofi .desc .nodes );
235
254
@@ -238,7 +257,7 @@ int rofi_init_internal(char *prov) {
238
257
return EXIT_FAILURE ;
239
258
}
240
259
241
- hints -> caps = FI_RMA | FI_ATOMIC | FI_COLLECTIVE ; // eventually want FI_ATOMIC
260
+ hints -> caps = FI_RMA | FI_ATOMIC | FI_COLLECTIVE ;
242
261
hints -> addr_format = FI_FORMAT_UNSPEC ;
243
262
hints -> domain_attr -> resource_mgmt = FI_RM_ENABLED ;
244
263
hints -> domain_attr -> threading = FI_THREAD_DOMAIN ;
@@ -249,9 +268,21 @@ int rofi_init_internal(char *prov) {
249
268
hints -> ep_attr -> type = FI_EP_RDM ;
250
269
hints -> tx_attr -> op_flags = FI_DELIVERY_COMPLETE ; // maybe need to change this to FI_INJECT_COMPLETE or FI_TRANSMIT_COMPLETE
251
270
252
- if (prov ) {
253
- hints -> fabric_attr -> prov_name = strdup (prov );
271
+ rofi_names_t * prov_names = NULL ;
272
+ if (provs ) {
273
+ prov_names = rofi_parse_names_internal (provs );
254
274
}
275
+ // else {
276
+ // names = rofi_parse_names_internal("verbs");
277
+ // }
278
+
279
+ rofi_names_t * domain_names = NULL ;
280
+ if (domains ) {
281
+ domain_names = rofi_parse_names_internal (domains );
282
+ }
283
+ // else {
284
+ // rofi.domains = rofi_parse_names_internal("ib");
285
+ // }
255
286
256
287
// this isn't really needed for verbs since it is a connected endpoint
257
288
rofi .remote_addrs = (fi_addr_t * )malloc (rofi .desc .nodes * sizeof (fi_addr_t ));
@@ -264,11 +295,27 @@ int rofi_init_internal(char *prov) {
264
295
rofi .remote_addrs [i ] = i ;
265
296
}
266
297
267
- rofi_transport_init (hints , & rofi );
298
+ rofi_transport_init (hints , & rofi , prov_names , domain_names );
299
+
300
+ if (prov_names ) {
301
+ for (int i = 0 ; i < prov_names -> num ; i ++ ) {
302
+ free (prov_names -> names [i ]);
303
+ }
304
+ free (prov_names );
305
+ }
306
+
307
+ if (domain_names ) {
308
+ for (int i = 0 ; i < domain_names -> num ; i ++ ) {
309
+ free (domain_names -> names [i ]);
310
+ }
311
+ free (domain_names );
312
+ }
268
313
269
314
mr_init ();
270
315
uint64_t global_barrier_size = rofi .desc .nodes * sizeof (uint64_t );
271
- int rofi_mr_size = global_barrier_size ;
316
+ uint64_t sub_alloc_barrier_size = rofi .desc .nodes * sizeof (uint64_t );
317
+ uint64_t sub_alloc_size = rofi .desc .nodes * sizeof (struct fi_rma_iov );
318
+ int rofi_mr_size = global_barrier_size + sub_alloc_barrier_size + sub_alloc_size ;
272
319
273
320
rofi .mr = mr_add (& rofi , rofi_mr_size , 0 );
274
321
if (!rofi .mr ) {
@@ -284,9 +331,14 @@ int rofi_init_internal(char *prov) {
284
331
285
332
rofi .global_barrier_id = 0 ;
286
333
rofi .global_barrier_buf = (uint64_t * )rofi .mr -> start ;
334
+ rofi .sub_alloc_barrier_buf = (uint64_t * )(rofi .mr -> start + global_barrier_size );
335
+ rofi .sub_alloc_buf = (struct fi_rma_iov * )(rofi .mr -> start + global_barrier_size + sub_alloc_barrier_size );
287
336
288
337
for (int i = 0 ; i < rofi .desc .nid ; i ++ ) {
289
338
rofi .global_barrier_buf [i ] = 0 ;
339
+ rofi .sub_alloc_barrier_buf [i ] = 0 ;
340
+ rofi .sub_alloc_buf [i ].key = 0 ;
341
+ rofi .sub_alloc_buf [i ].addr = 0 ;
290
342
}
291
343
fi_freeinfo (hints );
292
344
return 0 ;
0 commit comments