2424// File created: 2023-12-11
2525// Last updated: 2024-05-15
2626//
27- use sorted_list:: SortedList ;
2827
2928use arrow:: array:: { ArrayRef , BooleanBuilder , Int32Builder , Int64Builder , StringBuilder } ;
3029use arrow:: record_batch:: RecordBatch ;
3130use log:: { debug, info} ;
31+ use ordered_channel:: bounded;
3232use parquet:: arrow:: ArrowWriter ;
3333use parquet:: basic:: Compression ;
3434use parquet:: file:: properties:: WriterProperties ;
@@ -48,8 +48,10 @@ use crate::schema;
4848use crate :: trimmer:: { trimmer_factory, ColumnTrimmer } ;
4949use crate :: { chunked, trimmer} ;
5050use arrow:: datatypes:: { Field , Schema , SchemaRef } ;
51+ use atomic_counter:: { AtomicCounter , ConsistentCounter } ;
5152use crossbeam:: atomic:: AtomicConsume ;
5253use libc:: bsearch;
54+ use ordered_channel:: Sender ;
5355use parquet:: errors:: { ParquetError , Result } ;
5456use parquet:: file:: metadata:: FileMetaData ;
5557use std:: sync:: atomic:: { AtomicUsize , Ordering , ATOMIC_USIZE_INIT } ;
@@ -58,6 +60,7 @@ use std::thread;
5860use std:: thread:: JoinHandle ;
5961use std:: time:: { Duration , Instant } ;
6062use thread:: spawn;
63+ //use ordered_channel::Sender;
6164//use crossbeam::channel::{Receiver, Sender};
6265
6366static GLOBAL_COUNTER : AtomicUsize = ATOMIC_USIZE_INIT ;
@@ -67,17 +70,13 @@ pub(crate) struct Slice2Arrow<'a> {
6770 pub ( crate ) fn_line_break : FnFindLastLineBreak < ' a > ,
6871 pub ( crate ) fn_line_break_len : FnLineBreakLen ,
6972 pub ( crate ) masterbuilders : MasterBuilders ,
73+ pub ( crate ) consistent_counter : ConsistentCounter ,
7074}
7175
7276pub ( crate ) struct MasterBuilders {
7377 builders : Vec < Vec < Box < dyn Sync + Send + ColumnBuilder > > > ,
7478 outfile : PathBuf ,
75- sender : Option < SyncSender < OrderedRecordBatch > > ,
76- }
77-
78- pub ( crate ) struct OrderedRecordBatch {
79- record_batch : RecordBatch ,
80- batch_nr : i32 ,
79+ sender : Option < Sender < RecordBatch > > ,
8180}
8281
8382unsafe impl Send for MasterBuilders { }
@@ -197,6 +196,7 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
197196 let mut builder_write_duration: Duration = Duration :: new ( 0 , 0 ) ;
198197
199198 let start_parse = Instant :: now ( ) ;
199+ let offset: usize = self . consistent_counter . get ( ) ;
200200
201201 let arc_slices = Arc :: new ( & slices) ;
202202 self . masterbuilders
@@ -219,23 +219,24 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
219219 for bb in n. iter_mut ( ) {
220220 br. push ( bb. finish ( ) ) ;
221221 }
222- let record_batch = OrderedRecordBatch {
223- record_batch : RecordBatch :: try_from_iter ( br) . unwrap ( ) ,
224- batch_nr : i as i32 ,
225- } ;
222+ let record_batch = RecordBatch :: try_from_iter ( br) . unwrap ( ) ;
223+
226224 let _ = self
227225 . masterbuilders
228226 . sender
229227 . clone ( )
230228 . unwrap ( )
231- . send ( record_batch) ;
229+ . send ( offset + i , record_batch) ;
232230 }
233231 }
234232 } ) ;
235233
236234 for ii in slices. iter ( ) {
237235 bytes_in += ii. len ( ) ;
238236 }
237+ let offset: usize = self
238+ . consistent_counter
239+ . add ( self . masterbuilders . builders . len ( ) ) ;
239240
240241 parse_duration = start_parse. elapsed ( ) ;
241242 ( bytes_in, bytes_out, parse_duration, builder_write_duration)
@@ -248,16 +249,18 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
248249 & _outfile,
249250 schema. clone ( ) ,
250251 ) ;
251- let ( sender, receiver) = sync_channel :: < OrderedRecordBatch > ( 100 ) ;
252+
253+ let ( sender, mut receiver) = bounded :: < RecordBatch > ( 10 ) ;
254+ // let (sender, receiver) = sync_channel::<RecordBatch>(100);
252255
253256 let t: JoinHandle < Result < format:: FileMetaData > > = thread:: spawn ( move || {
254257 ' outer: loop {
255- let message = receiver. recv ( ) ;
258+ let mut message = receiver. recv ( ) ;
256259
257260 match message {
258261 Ok ( rb) => {
259- writer. write ( & rb. record_batch ) . expect ( "Error Writing batch" ) ;
260- if ( rb. record_batch . num_rows ( ) == 0 ) {
262+ writer. write ( & rb) . expect ( "Error Writing batch" ) ;
263+ if ( rb. num_rows ( ) == 0 ) {
261264 break ' outer;
262265 }
263266 }
@@ -284,12 +287,9 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
284287 false ,
285288 ) ] ) ;
286289
287- let emptyrb = OrderedRecordBatch {
288- record_batch : arrow:: record_batch:: RecordBatch :: new_empty ( Arc :: new ( schema) ) ,
289- batch_nr : 0 ,
290- } ;
291-
292- let _ = & self . masterbuilders . sender . clone ( ) . unwrap ( ) . send ( emptyrb) ;
290+ let emptyrb = arrow:: record_batch:: RecordBatch :: new_empty ( Arc :: new ( schema) ) ;
291+ let c = self . consistent_counter . get ( ) ;
292+ let _ = & self . masterbuilders . sender . clone ( ) . unwrap ( ) . send ( c, emptyrb) ;
293293 }
294294}
295295
0 commit comments