2525// Last updated: 2024-05-15
2626//
2727
28+ use crate :: chunked:: trimmer:: { trimmer_factory, ColumnTrimmer } ;
2829use arrow:: array:: { ArrayRef , BooleanBuilder , Int32Builder , Int64Builder , StringBuilder } ;
2930use arrow:: record_batch:: RecordBatch ;
3031use log:: { debug, info} ;
@@ -38,22 +39,28 @@ use rayon::prelude::*;
3839
3940use std:: fs;
4041use std:: fs:: File ;
42+ use std:: ops:: Deref ;
4143use std:: path:: PathBuf ;
4244use std:: str:: from_utf8_unchecked;
4345use std:: sync:: Arc ;
4446
45- use super :: { ColumnBuilder , Converter , FnFindLastLineBreak , FnLineBreakLen } ;
47+ use super :: {
48+ arrow_file_output, trimmer, ColumnBuilder , Converter , FnFindLastLineBreak , FnLineBreakLen ,
49+ Stats ,
50+ } ;
51+ use crate :: chunked;
52+ use crate :: chunked:: threaded_file_output:: { ipc_file_out, output_factory, parquet_file_out} ;
53+ pub use crate :: cli:: Targets ;
4654use crate :: datatype:: DataType ;
4755use crate :: schema;
48- use crate :: trimmer:: { trimmer_factory, ColumnTrimmer } ;
49- use crate :: { chunked, trimmer} ;
5056use arrow:: datatypes:: { Field , Schema , SchemaRef } ;
5157use atomic_counter:: { AtomicCounter , ConsistentCounter } ;
5258use crossbeam:: atomic:: AtomicConsume ;
5359use libc:: bsearch;
5460use ordered_channel:: Sender ;
5561use parquet:: errors:: { ParquetError , Result } ;
5662use parquet:: file:: metadata:: FileMetaData ;
63+ use rayon:: join;
5764use std:: sync:: atomic:: { AtomicUsize , Ordering , ATOMIC_USIZE_INIT } ;
5865use std:: sync:: mpsc:: { sync_channel, Receiver , RecvError , SyncSender } ;
5966use std:: thread;
@@ -71,6 +78,7 @@ pub(crate) struct Slice2Arrow<'a> {
7178 pub ( crate ) fn_line_break_len : FnLineBreakLen ,
7279 pub ( crate ) masterbuilders : MasterBuilders ,
7380 pub ( crate ) consistent_counter : ConsistentCounter ,
81+ pub ( crate ) target : Targets ,
7482}
7583
7684pub ( crate ) struct MasterBuilders {
@@ -93,21 +101,6 @@ impl MasterBuilders {
93101 let batch = RecordBatch :: try_from_iter ( br) . unwrap ( ) ;
94102 batch. schema ( )
95103 }
96- pub fn writer_factory ( out_file : & PathBuf , schema : SchemaRef ) -> ArrowWriter < File > {
97- let _out_file = fs:: OpenOptions :: new ( )
98- . create ( true )
99- . append ( true )
100- . open ( out_file)
101- . expect ( "aaa" ) ;
102-
103- let props = WriterProperties :: builder ( )
104- . set_compression ( Compression :: SNAPPY )
105- . build ( ) ;
106-
107- let writer: ArrowWriter < File > =
108- ArrowWriter :: try_new ( _out_file, schema, Some ( props. clone ( ) ) ) . unwrap ( ) ;
109- writer
110- }
111104
112105 pub fn builders_factory ( out_file : PathBuf , schema_path : PathBuf , instances : i16 ) -> Self {
113106 let schema = schema:: FixedSchema :: from_path ( schema_path) . unwrap ( ) ;
@@ -242,40 +235,14 @@ impl<'a> Converter<'a> for Slice2Arrow<'a> {
242235 ( bytes_in, bytes_out, parse_duration, builder_write_duration)
243236 }
244237
245- fn setup ( & mut self ) -> JoinHandle < Result < format:: FileMetaData > > {
246- let schema = self . masterbuilders . schema_factory ( ) ;
247- let _outfile = self . masterbuilders . outfile . clone ( ) ;
248- let mut writer = crate :: chunked:: arrow_converter:: MasterBuilders :: writer_factory (
249- & _outfile,
250- schema. clone ( ) ,
238+ fn setup ( & mut self ) -> ( Sender < RecordBatch > , JoinHandle < Result < Stats > > ) {
239+ let o = output_factory (
240+ self . target . clone ( ) ,
241+ self . masterbuilders . schema_factory ( ) ,
242+ self . masterbuilders . outfile . clone ( ) ,
251243 ) ;
252-
253- let ( sender, mut receiver) = bounded :: < RecordBatch > ( 100 ) ;
254-
255- let t: JoinHandle < Result < format:: FileMetaData > > = thread:: spawn ( move || {
256- ' outer: loop {
257- let mut message = receiver. recv ( ) ;
258-
259- match message {
260- Ok ( rb) => {
261- writer. write ( & rb) . expect ( "Error Writing batch" ) ;
262- if ( rb. num_rows ( ) == 0 ) {
263- break ' outer;
264- }
265- }
266- Err ( e) => {
267- info ! ( "got RecvError in channel , break to outer" ) ;
268- break ' outer;
269- }
270- }
271- }
272- info ! ( "closing the writer for parquet" ) ;
273- writer. finish ( )
274- } ) ;
275- self . masterbuilders . sender = Some ( sender. clone ( ) ) ;
276- t
277-
278- // bytes_out += crate::chunked::arrow_converter::GLOBAL_COUNTER.load_consume();
244+ self . masterbuilders . sender = Some ( o. 0 . clone ( ) ) ;
245+ o
279246 }
280247
281248 fn shutdown ( & mut self ) {
0 commit comments