2727
2828use arrow:: array:: { ArrayRef , BooleanBuilder , Int32Builder , Int64Builder , StringBuilder } ;
2929use arrow:: record_batch:: RecordBatch ;
30- use log:: debug;
30+ use log:: { debug, info } ;
3131use parquet:: arrow:: ArrowWriter ;
3232use parquet:: basic:: Compression ;
3333use parquet:: file:: properties:: WriterProperties ;
@@ -46,26 +46,48 @@ use crate::datatype::DataType;
4646use crate :: schema;
4747use crate :: trimmer:: { trimmer_factory, ColumnTrimmer } ;
4848use crate :: { chunked, trimmer} ;
49+ use arrow:: datatypes:: { Field , Schema , SchemaRef } ;
50+ use crossbeam:: atomic:: AtomicConsume ;
51+ use parquet:: errors:: { ParquetError , Result } ;
52+ use parquet:: file:: metadata:: FileMetaData ;
53+ use std:: sync:: atomic:: { AtomicUsize , Ordering , ATOMIC_USIZE_INIT } ;
54+ use std:: sync:: mpsc:: { sync_channel, Receiver , RecvError , SyncSender } ;
55+ use std:: thread;
56+ use std:: thread:: JoinHandle ;
4957use std:: time:: { Duration , Instant } ;
58+ use thread:: spawn;
59+ //use crossbeam::channel::{Receiver, Sender};
60+
/// Process-wide counter, initialised to zero.
///
/// Built with the const constructor `AtomicUsize::new` rather than the
/// deprecated `ATOMIC_USIZE_INIT` constant.
///
/// NOTE(review): no live code visible in this file reads or updates this
/// counter — confirm whether it is still needed.
static GLOBAL_COUNTER: AtomicUsize = AtomicUsize::new(0);
5062
5163pub ( crate ) struct Slice2Arrow < ' a > {
5264 // pub(crate) file_out: File,
53- pub ( crate ) writer : ArrowWriter < File > ,
5465 pub ( crate ) fn_line_break : FnFindLastLineBreak < ' a > ,
5566 pub ( crate ) fn_line_break_len : FnLineBreakLen ,
5667 pub ( crate ) masterbuilders : MasterBuilders ,
5768}
5869
/// Owns the column builders together with the output destination and the
/// channel used to hand finished record batches to the writer thread.
pub(crate) struct MasterBuilders {
    /// Builder sets: each inner `Vec` is finished into one `RecordBatch`
    /// during conversion; `schema_factory` reads the set at index 0.
    builders: Vec<Vec<Box<dyn Sync + Send + ColumnBuilder>>>,
    /// Path of the output file, passed to `writer_factory` by `setup`.
    outfile: PathBuf,
    /// Sending half of the record-batch channel. `builders_factory` leaves it
    /// as `None`; `setup` installs it.
    sender: Option<SyncSender<RecordBatch>>,
}
6375
// SAFETY: NOTE(review) — these impls assert that `MasterBuilders` may be moved
// to and shared between threads. The boxed builders are already declared
// `Sync + Send`; the remaining fields (`PathBuf`, `Option<SyncSender<RecordBatch>>`)
// look thread-safe on their own, so these manual impls may be redundant —
// confirm before relying on them.
unsafe impl Send for MasterBuilders {}
unsafe impl Sync for MasterBuilders {}
6678
6779impl MasterBuilders {
68- pub fn writer_factory ( & mut self , out_file : & PathBuf ) -> ArrowWriter < File > {
80+ pub fn schema_factory ( & mut self ) -> SchemaRef {
81+ let b: & mut Vec < Box < dyn Sync + Send + ColumnBuilder > > = self . builders . get_mut ( 0 ) . unwrap ( ) ;
82+ let mut br: Vec < ( & str , ArrayRef ) > = vec ! [ ] ;
83+ for bb in b. iter_mut ( ) {
84+ br. push ( bb. finish ( ) ) ;
85+ }
86+
87+ let batch = RecordBatch :: try_from_iter ( br) . unwrap ( ) ;
88+ batch. schema ( )
89+ }
90+ pub fn writer_factory ( out_file : & PathBuf , schema : SchemaRef ) -> ArrowWriter < File > {
6991 let _out_file = fs:: OpenOptions :: new ( )
7092 . create ( true )
7193 . append ( true )
@@ -76,19 +98,12 @@ impl MasterBuilders {
7698 . set_compression ( Compression :: SNAPPY )
7799 . build ( ) ;
78100
79- let b: & mut Vec < Box < dyn Sync + Send + ColumnBuilder > > = self . builders . get_mut ( 0 ) . unwrap ( ) ;
80- let mut br: Vec < ( & str , ArrayRef ) > = vec ! [ ] ;
81- for bb in b. iter_mut ( ) {
82- br. push ( bb. finish ( ) ) ;
83- }
84-
85- let batch = RecordBatch :: try_from_iter ( br) . unwrap ( ) ;
86101 let writer: ArrowWriter < File > =
87- ArrowWriter :: try_new ( _out_file, batch . schema ( ) , Some ( props. clone ( ) ) ) . unwrap ( ) ;
102+ ArrowWriter :: try_new ( _out_file, schema, Some ( props. clone ( ) ) ) . unwrap ( ) ;
88103 writer
89104 }
90105
91- pub fn builders_factory ( schema_path : PathBuf , instances : i16 ) -> Self {
106+ pub fn builders_factory ( out_file : PathBuf , schema_path : PathBuf , instances : i16 ) -> Self {
92107 let schema = schema:: FixedSchema :: from_path ( schema_path) . unwrap ( ) ;
93108 let antal_col = schema. num_columns ( ) ;
94109 let mut builders: Vec < Vec < Box < dyn ColumnBuilder + Sync + Send > > > = Vec :: new ( ) ;
@@ -152,7 +167,11 @@ impl MasterBuilders {
152167 }
153168 builders. push ( buildersmut) ;
154169 }
155- MasterBuilders { builders }
170+ MasterBuilders {
171+ builders : builders,
172+ outfile : out_file,
173+ sender : None ,
174+ }
156175 }
157176}
158177
@@ -197,6 +216,8 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
197216 }
198217 parse_duration = start_parse. elapsed ( ) ;
199218
219+ let mut rb: Vec < RecordBatch > = Vec :: new ( ) ;
220+
200221 for b in self . masterbuilders . builders . iter_mut ( ) {
201222 let mut br: Vec < ( & str , ArrayRef ) > = vec ! [ ] ;
202223
@@ -205,24 +226,72 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
205226 }
206227
207228 let start_builder_write = Instant :: now ( ) ;
208- let batch = RecordBatch :: try_from_iter ( br) . unwrap ( ) ;
229+ let record_batch = RecordBatch :: try_from_iter ( br) . unwrap ( ) ;
230+ // rb.push(RecordBatch::try_from_iter(br).unwrap());
209231
210- self . writer . write ( & batch) . expect ( "Error Writing batch" ) ;
211- bytes_out += self . writer . bytes_written ( ) ;
232+ let _ = self
233+ . masterbuilders
234+ . sender
235+ . clone ( )
236+ . unwrap ( )
237+ . send ( record_batch) ;
212238
213239 builder_write_duration += start_builder_write. elapsed ( ) ;
214240 }
241+ // let writer: ArrowWriter<File> =
242+
215243 debug ! ( "Batch write: accumulated bytes_written {}" , bytes_out) ;
216244
217245 ( bytes_in, bytes_out, parse_duration, builder_write_duration)
218246 }
219247
220- fn finish ( & mut self ) -> parquet:: errors:: Result < format:: FileMetaData > {
221- self . writer . finish ( )
248+ fn setup ( & mut self ) -> JoinHandle < Result < format:: FileMetaData > > {
249+ let schema = self . masterbuilders . schema_factory ( ) ;
250+ let _outfile = self . masterbuilders . outfile . clone ( ) ;
251+ let mut writer = crate :: chunked:: arrow_converter:: MasterBuilders :: writer_factory (
252+ & _outfile,
253+ schema. clone ( ) ,
254+ ) ;
255+ let ( sender, receiver) = sync_channel :: < RecordBatch > ( 1 ) ;
256+
257+ let t: JoinHandle < Result < format:: FileMetaData > > = thread:: spawn ( move || {
258+ ' outer: loop {
259+ let message = receiver. recv ( ) ;
260+ match message {
261+ Ok ( rb) => {
262+ writer. write ( & rb) . expect ( "Error Writing batch" ) ;
263+ if ( rb. num_rows ( ) == 0 ) {
264+ break ' outer;
265+ }
266+ }
267+ Err ( e) => {
268+ info ! ( "got RecvError in channel , break to outer" ) ;
269+
270+ break ' outer;
271+ }
272+ }
273+ }
274+ info ! ( "closing the writer for parquet" ) ;
275+
276+ writer. finish ( )
277+ } ) ;
278+ self . masterbuilders . sender = Some ( sender. clone ( ) ) ;
279+ t
280+
281+ // bytes_out += crate::chunked::arrow_converter::GLOBAL_COUNTER.load_consume();
222282 }
223283
224- fn get_finish_bytes_written ( & mut self ) -> usize {
225- self . writer . bytes_written ( )
284+ fn shutdown ( & mut self ) {
285+ // converter.shutdown();
286+ let schema = Schema :: new ( vec ! [ Field :: new(
287+ "id" ,
288+ arrow:: datatypes:: DataType :: Int32 ,
289+ false ,
290+ ) ] ) ;
291+
292+ let emptyrb = arrow:: record_batch:: RecordBatch :: new_empty ( Arc :: new ( schema) ) ;
293+
294+ let _ = & self . masterbuilders . sender . clone ( ) . unwrap ( ) . send ( emptyrb) ;
226295 }
227296}
228297
0 commit comments