@@ -269,18 +269,20 @@ def write_files_from_queues(sample_path, queues):
269
269
"""
270
270
os .makedirs (sample_path , exist_ok = True )
271
271
for c_name in queues .keys ():
272
- dest_base = dest_base = os .path .join (
272
+ dest_base = os .path .join (
273
273
sample_path , "{}.{}" .format (c_name , CorpusName .SAMPLE ))
274
274
with open (dest_base + ".src" , 'w' , encoding = "utf-8" ) as f_src ,\
275
275
open (dest_base + ".tgt" , 'w' , encoding = "utf-8" ) as f_tgt :
276
276
while True :
277
277
_next = False
278
- for i , q in enumerate ( queues [c_name ]) :
278
+ for q in queues [c_name ]:
279
279
item = q .get ()
280
+ if item == "blank" :
281
+ continue
280
282
if item == "break" :
281
283
_next = True
282
284
break
283
- j , src_line , tgt_line = item
285
+ _ , src_line , tgt_line = item
284
286
f_src .write (src_line + '\n ' )
285
287
f_tgt .write (tgt_line + '\n ' )
286
288
if _next :
@@ -299,6 +301,8 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
299
301
for i , item in enumerate (c_iter ):
300
302
maybe_example = DatasetAdapter ._process (item , is_train = True )
301
303
if maybe_example is None :
304
+ if opts .dump_samples :
305
+ build_sub_vocab .queues [c_name ][offset ].put ("blank" )
302
306
continue
303
307
src_line , tgt_line = maybe_example ['src' ], maybe_example ['tgt' ]
304
308
sub_counter_src .update (src_line .split (' ' ))
0 commit comments