Skip to content

Commit bc6136f

Browse files
authored
Merge pull request #39 from firelink-data/feature/name_refactor
Feature/name refactor
2 parents 344fcc1 + 81fccbe commit bc6136f

7 files changed

Lines changed: 408 additions & 108 deletions

File tree

src/chunked/arrow_converter.rs

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ use std::str::from_utf8_unchecked;
4242
use std::sync::Arc;
4343

4444
use super::{ColumnBuilder, Converter, FnFindLastLineBreak, FnLineBreakLen};
45-
use crate::chunked;
4645
use crate::datatype::DataType;
4746
use crate::schema;
47+
use crate::trimmer::{trimmer_factory, ColumnTrimmer};
48+
use crate::{chunked, trimmer};
4849

4950
pub(crate) struct Slice2Arrow<'a> {
5051
// pub(crate) file_out: File,
@@ -100,6 +101,11 @@ impl MasterBuilders {
100101
boolean_builder: BooleanBuilder::new(),
101102
runes_in_column: col.length(),
102103
name: col.name().to_string().clone(),
104+
trimmer: trimmer_factory(
105+
DataType::Boolean,
106+
col.alignment(),
107+
col.pad_symbol(),
108+
),
103109
})),
104110
DataType::Float16 => todo!(),
105111
DataType::Float32 => todo!(),
@@ -109,21 +115,37 @@ impl MasterBuilders {
109115
int32builder: Int32Builder::new(),
110116
runes_in_column: col.length(),
111117
name: col.name().to_string().clone(),
118+
trimmer: trimmer_factory(
119+
DataType::Int32,
120+
col.alignment(),
121+
col.pad_symbol(),
122+
),
112123
})),
113124
DataType::Int64 => buildersmut.push(Box::new(HandlerInt64Builder {
114125
int64builder: Int64Builder::new(),
115126
runes_in_column: col.length(),
116127
name: col.name().to_string().clone(),
128+
trimmer: trimmer_factory(
129+
DataType::Int64,
130+
col.alignment(),
131+
col.pad_symbol(),
132+
),
117133
})),
118134
DataType::Utf8 => buildersmut.push(Box::new(HandlerStringBuilder {
119135
string_builder: StringBuilder::new(),
120136
runes_in_column: col.length(),
121137
name: col.name().to_string().clone(),
138+
trimmer: trimmer_factory(DataType::Utf8, col.alignment(), col.pad_symbol()),
122139
})),
123140
DataType::LargeUtf8 => buildersmut.push(Box::new(HandlerStringBuilder {
124141
string_builder: StringBuilder::new(),
125142
runes_in_column: col.length(),
126143
name: col.name().to_string().clone(),
144+
trimmer: trimmer_factory(
145+
DataType::LargeUtf8,
146+
col.alignment(),
147+
col.pad_symbol(),
148+
),
127149
})),
128150
}
129151
}
@@ -217,15 +239,17 @@ struct HandlerInt32Builder {
217239
int32builder: Int32Builder,
218240
runes_in_column: usize,
219241
name: String,
242+
trimmer: Box<dyn ColumnTrimmer + Send + Sync>,
220243
}
221244

222245
impl ColumnBuilder for HandlerInt32Builder {
223246
fn parse_value(&mut self, data: &[u8]) -> usize
224247
where
225248
Self: Sized,
226249
{
227-
let (start, stop) =
228-
chunked::column_length_num_rightaligned(data, self.runes_in_column as i16);
250+
let (start, stop) = self
251+
.trimmer
252+
.find_start_stop(data, self.runes_in_column as i16);
229253

230254
match atoi_simd::parse(&data[start..stop]) {
231255
Ok(n) => {
@@ -249,14 +273,16 @@ struct HandlerInt64Builder {
249273
int64builder: Int64Builder,
250274
runes_in_column: usize,
251275
name: String,
276+
trimmer: Box<dyn ColumnTrimmer + Send + Sync>,
252277
}
253278
impl ColumnBuilder for HandlerInt64Builder {
254279
fn parse_value(&mut self, data: &[u8]) -> usize
255280
where
256281
Self: Sized,
257282
{
258-
let (start, stop) =
259-
chunked::column_length_num_rightaligned(data, self.runes_in_column as i16);
283+
let (start, stop) = self
284+
.trimmer
285+
.find_start_stop(data, self.runes_in_column as i16);
260286
match atoi_simd::parse(&data[start..stop]) {
261287
Ok(n) => {
262288
self.int64builder.append_value(n);
@@ -282,13 +308,15 @@ struct HandlerStringBuilder {
282308
string_builder: StringBuilder,
283309
runes_in_column: usize,
284310
name: String,
311+
trimmer: Box<dyn ColumnTrimmer + Send + Sync>,
285312
}
286313
impl ColumnBuilder for HandlerStringBuilder {
287314
fn parse_value(&mut self, data: &[u8]) -> usize
288315
where
289316
Self: Sized,
290317
{
291-
let column_length: usize = chunked::column_length(data, self.runes_in_column as i16);
318+
let column_length: usize =
319+
trimmer::count_rune_bytelength(data, self.runes_in_column as i16);
292320
// Me dont like ... what is the cost ? Could it be done once for the whole chunk ?
293321
let text: &str = unsafe { from_utf8_unchecked(&data[..column_length]) };
294322

@@ -316,15 +344,17 @@ struct HandlerBooleanBuilder {
316344
boolean_builder: BooleanBuilder,
317345
runes_in_column: usize,
318346
name: String,
347+
trimmer: Box<dyn ColumnTrimmer + Send + Sync>,
319348
}
320349

321350
impl ColumnBuilder for HandlerBooleanBuilder {
322351
fn parse_value(&mut self, data: &[u8]) -> usize
323352
where
324353
Self: Sized,
325354
{
326-
let (start, stop) =
327-
chunked::column_length_char_rightaligned(data, self.runes_in_column as i16);
355+
let (start, stop) = self
356+
.trimmer
357+
.find_start_stop(data, self.runes_in_column as i16);
328358

329359
let text: &str = unsafe { from_utf8_unchecked(&data[start..stop]) };
330360

src/chunked/mod.rs

Lines changed: 3 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,13 @@
2828
use arrow::array::ArrayRef;
2929
use parquet::format;
3030

31-
use std::cmp::min;
3231
use std::fs;
3332

34-
use self::slicer::SLICER_IN_CHUNK_SIZE;
33+
use self::residual_slicer::SLICER_IN_CHUNK_SIZE;
3534

3635
pub(crate) mod arrow_converter;
37-
pub(crate) mod converter;
38-
pub(crate) mod slicer;
36+
pub(crate) mod residual_slicer;
37+
pub(crate) mod self_converter;
3938

4039
pub(crate) struct ChunkAndResidue {
4140
pub(crate) chunk: Box<[u8; SLICER_IN_CHUNK_SIZE]>,
@@ -158,94 +157,3 @@ pub trait ColumnBuilder {
158157
fn finish(&mut self) -> (&str, ArrayRef);
159158
// fn name(& self) -> &String;
160159
}
161-
162-
/// Locates the numeric payload of a right-aligned, fixed-width column.
///
/// At most `runes` leading bytes of `data` are inspected. Every byte in front of
/// the first ASCII digit or minus sign is treated as padding and skipped. The
/// minus sign is kept so that negative values do not silently lose their sign
/// when the resulting slice is handed to an integer parser.
///
/// # Returns
///
/// `(start, stop)` byte offsets into `data`:
/// * `start` is the offset of the first digit or `-`, or `stop` when the
///   inspected window holds neither (for example an all-padding column).
/// * `stop` is `min(data.len(), runes)`.
///
/// `start <= stop <= data.len()` always holds, so `&data[start..stop]` never
/// panics.
fn column_length_num_rightaligned(data: &[u8], runes: i16) -> (usize, usize) {
    // NOTE(review): a negative `runes` wraps to a very large width here, which
    // makes the window cover all of `data`. Kept as in the previous
    // implementation; callers are expected to pass a non-negative width.
    let width = runes as usize;
    let stop = data.len().min(width);

    // Offset of the first byte that belongs to the number itself.
    let start = data[..stop]
        .iter()
        .position(|&byte| byte.is_ascii_digit() || byte == b'-')
        .unwrap_or(stop);

    (start, stop)
}
188-
189-
/// Locates the textual payload of a right-aligned, fixed-width column.
///
/// At most `runes` leading bytes of `data` are inspected. Every byte in front of
/// the first ASCII letter (`A-Z` or `a-z`) is treated as padding and skipped.
///
/// The previous implementation matched on `101..=132` and `141..=172`. Those are
/// the octal codes of `A-Z` and `a-z` written as decimal literals, so upper-case
/// letters and `a`-`d` were never recognised and were trimmed away as padding.
///
/// # Returns
///
/// `(start, stop)` byte offsets into `data`:
/// * `start` is the offset of the first ASCII letter, or `stop` when the
///   inspected window holds none.
/// * `stop` is `min(data.len(), runes)`.
///
/// `start <= stop <= data.len()` always holds, so `&data[start..stop]` never
/// panics.
fn column_length_char_rightaligned(data: &[u8], runes: i16) -> (usize, usize) {
    // NOTE(review): a negative `runes` wraps to a very large width here, which
    // makes the window cover all of `data`. Kept as in the previous
    // implementation; callers are expected to pass a non-negative width.
    let width = runes as usize;
    let stop = data.len().min(width);

    // Offset of the first byte that belongs to the text itself.
    let start = data[..stop]
        .iter()
        .position(|byte| byte.is_ascii_alphabetic())
        .unwrap_or(stop);

    (start, stop)
}
216-
217-
/// Returns the number of bytes occupied by the first `runes` UTF-8 encoded
/// characters of `data`.
///
/// The width of each character is derived from its lead byte only; continuation
/// bytes are skipped without being validated.
///
/// Counting stops early when `data` is exhausted. A character whose lead byte
/// announces more bytes than `data` still holds is not counted. The returned
/// length therefore never exceeds `data.len()` and always ends on a character
/// boundary, so `&data[..len]` is safe to slice. The previous implementation
/// counted the truncated character in full and could return a length past the
/// end of `data`.
///
/// # Panics
///
/// Panics with "Incorrect UTF-8 sequence" when a byte in lead position is not a
/// valid UTF-8 lead byte (a stray continuation byte, or `0xF8` and above). The
/// signature leaves no way to report this as an error.
fn column_length(data: &[u8], runes: i16) -> usize {
    // NOTE(review): a negative `runes` wraps to a very large count here, which
    // makes the loop run until `data` is exhausted. Kept as in the previous
    // implementation.
    let wanted_runes = runes as usize;

    let mut counted_runes: usize = 0;
    let mut len: usize = 0;

    while counted_runes < wanted_runes {
        // `len` always points at the lead byte of the next character.
        let lead: u8 = match data.get(len) {
            Some(byte) => *byte,
            None => return len,
        };

        let units: usize = match lead {
            b if b >> 7 == 0 => 1,
            b if b >> 5 == 0b110 => 2,
            b if b >> 4 == 0b1110 => 3,
            b if b >> 3 == 0b11110 => 4,
            // TODO: surface this as a proper error once the signature allows it.
            _ => panic!("Incorrect UTF-8 sequence"),
        };

        // The final character is cut off by the end of the data: leave it out.
        if data.len() - len < units {
            return len;
        }

        len += units;
        counted_runes += 1;
    }

    len
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@ pub(crate) const SLICER_MAX_RESIDUE_SIZE: usize = SLICER_IN_CHUNK_SIZE;
3939

4040
pub(crate) const IN_MAX_CHUNKS: usize = 2;
4141

42-
pub(crate) struct OldSlicer<'a> {
42+
pub(crate) struct ResidualSlicer<'a> {
4343
pub(crate) fn_find_last_nl: FnFindLastLineBreak<'a>,
4444
}
4545

46-
impl<'a> Slicer<'a> for OldSlicer<'a> {
46+
impl<'a> Slicer<'a> for ResidualSlicer<'a> {
4747
fn slice_and_convert(
4848
&mut self,
4949
mut converter: Box<dyn 'a + Converter<'a>>,

src/cli.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ use std::path::PathBuf;
4242
#[cfg(feature = "rayon")]
4343
use crate::chunked::arrow_converter::{MasterBuilders, Slice2Arrow};
4444
#[cfg(feature = "rayon")]
45-
use crate::chunked::converter::SampleSliceAggregator;
45+
use crate::chunked::residual_slicer::ResidualSlicer;
4646
#[cfg(feature = "rayon")]
47-
use crate::chunked::slicer::OldSlicer;
47+
use crate::chunked::self_converter::SampleSliceAggregator;
4848
#[cfg(feature = "rayon")]
4949
use crate::chunked::{find_last_nl, line_break_len_cr, Converter as ChunkedConverter, Slicer};
5050
use crate::converter::Converter;
@@ -276,7 +276,7 @@ impl Cli {
276276
} => {
277277
let _in_file = fs::File::open(in_file)?;
278278

279-
let mut slicer_instance: Box<dyn Slicer> = Box::new(OldSlicer {
279+
let mut slicer_instance: Box<dyn Slicer> = Box::new(ResidualSlicer {
280280
fn_find_last_nl: find_last_nl,
281281
});
282282

src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ mod parser;
4747
mod schema;
4848
mod slicer;
4949
mod threads;
50+
mod trimmer;
5051
mod writer;
5152

5253
use cli::Cli;

0 commit comments

Comments
 (0)