Skip to content

Commit 5848af3

Browse files
authored
Merge pull request #32 from firelink-data/merge_single_to_main
First merge of Wilhelms branch. Mockery only atm
2 parents 1a216e8 + 7bffc34 commit 5848af3

11 files changed

Lines changed: 1127 additions & 259 deletions

File tree

Cargo.toml

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
[package]
22
name = "evolution"
3-
version = "0.2.2"
3+
version = "0.3.5"
44
edition = "2021"
55
description = "🦖 Evolve your fixed length data files into Apache Arrow tables, fully parallelized!"
66
authors = [
77
"Wilhelm Ågren <wilhelmagren98@gmail.com>",
8+
"Ted Hammarlund <TedHammarlund@gmail.com>",
9+
"Rickard Lundin <rickard@x14.se>",
810
]
911

1012
readme = "README.md"
@@ -28,19 +30,19 @@ arrow-schema = "51.0.0"
2830
arrow-array = "51.0.0"
2931
parquet = "51.0.0"
3032
atoi_simd = "0.15.6"
31-
chrono = "0.4.35"
32-
clap = { version = "4.5.3", features = ["default", "derive"] }
33+
chrono = "0.4.38"
34+
clap = { version = "4.5.4", features = ["default", "derive"] }
3335
crossbeam = "0.8.4"
3436
colored = "2.1.0"
3537
env_logger = "0.11.3"
36-
half = "2.4.0"
38+
half = "2.4.1"
3739
log = "0.4.21"
3840
num_cpus = "1.16.0"
39-
padder = "1.0.0"
40-
rand = "0.8.5"
41-
rayon = "1.9.0"
42-
serde = { version = "1.0.197", features = ["derive"] }
43-
serde_json = "1.0.114"
41+
padder = "1.1.0"
42+
rand = { version = "0.8.5" }
43+
rayon = { version = "1.10.0" }
44+
serde = { version = "1.0.201", features = ["derive"] }
45+
serde_json = "1.0.117"
4446
threadpool = "1.8.1"
4547
substring = "1.4.5"
4648
tempfile = "3.10.1"

src/builder.rs

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/*
2+
* MIT License
3+
*
4+
* Copyright (c) 2024 Firelink Data
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*
24+
* File created: 2024-05-07
25+
* Last updated: 2024-05-08
26+
*/
27+
28+
use arrow::array::{ArrayRef, BooleanBuilder};
29+
use log::warn;
30+
31+
use std::fmt::Debug;
32+
use std::sync::Arc;
33+
34+
use crate::parser::BooleanParser;
35+
36+
///
37+
pub(crate) trait ColumnBuilder: Debug {
38+
fn parse_and_push_bytes(&mut self, bytes: &[u8]);
39+
fn runes(&self) -> usize;
40+
fn finish(&mut self) -> (&str, ArrayRef);
41+
}
42+
43+
///
44+
#[derive(Debug)]
45+
pub(crate) struct BooleanColumnBuilder {
46+
inner: BooleanBuilder,
47+
runes: usize,
48+
name: String,
49+
parser: BooleanParser,
50+
}
51+
52+
///
53+
impl BooleanColumnBuilder {
54+
///
55+
pub fn new(runes: usize, name: String, parser: BooleanParser) -> Self {
56+
Self {
57+
inner: BooleanBuilder::new(),
58+
runes,
59+
name,
60+
parser,
61+
}
62+
}
63+
}
64+
65+
///
66+
impl ColumnBuilder for BooleanColumnBuilder {
67+
///
68+
fn parse_and_push_bytes(&mut self, bytes: &[u8]) {
69+
match self.parser.parse(bytes) {
70+
Ok(b) => self.inner.append_value(b),
71+
Err(e) => {
72+
warn!("Could not convert utf-8 text to bool: {:?}", e);
73+
self.inner.append_null();
74+
},
75+
};
76+
}
77+
78+
///
79+
fn runes(&self) -> usize {
80+
self.runes
81+
}
82+
83+
///
84+
fn finish(&mut self) -> (&str, ArrayRef) {
85+
(
86+
&self.name,
87+
Arc::new(self.inner.finish()) as ArrayRef,
88+
)
89+
}
90+
}
91+
92+
///
93+
#[derive(Debug)]
94+
pub(crate) struct Float16Builder {
95+
pub runes: usize,
96+
pub name: String,
97+
}
98+
99+
///
100+
#[derive(Debug)]
101+
pub(crate) struct Float32Builder {
102+
pub runes: usize,
103+
pub name: String,
104+
}
105+
106+
///
107+
#[derive(Debug)]
108+
pub(crate) struct Float64Builder {
109+
pub runes: usize,
110+
pub name: String,
111+
}
112+
113+
///
114+
#[derive(Debug)]
115+
pub(crate) struct Int16Builder {
116+
pub runes: usize,
117+
pub name: String,
118+
}
119+
120+
///
121+
#[derive(Debug)]
122+
pub(crate) struct Int32Builder {
123+
pub runes: usize,
124+
pub name: String,
125+
}
126+
127+
///
128+
#[derive(Debug)]
129+
pub(crate) struct Int64Builder {
130+
pub runes: usize,
131+
pub name: String,
132+
}
133+
134+
///
135+
#[derive(Debug)]
136+
pub(crate) struct StringBuilder {
137+
pub runes: usize,
138+
pub name: String,
139+
}

src/cli.rs

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,12 @@ use crate::slicers::old_slicer::{OldSlicer, IN_MAX_CHUNKS};
3838
use crate::slicers::Slicer;
3939
use crate::slicers::{find_last_nl, line_break_len_cr, ChunkAndResidue};
4040
use crate::{error, mocker, schema};
41-
use clap::{Parser, Subcommand};
41+
use clap::{value_parser, ArgAction, Parser, Subcommand};
4242
use log::info;
4343
use parquet::arrow::ArrowWriter;
44+
use crate::mocker::Mocker;
45+
use crate::error::Result;
46+
use crate::threads::get_available_threads;
4447

4548
#[derive(Parser)]
4649
#[command(author, version, about, long_about = None)]
@@ -77,13 +80,32 @@ enum Commands {
7780
/// does testing things
7881
Mock {
7982
/// Sets schema file
80-
#[arg(short, long, value_name = "SCHEMA")]
83+
#[arg(short = 's', long="schema", value_name = "SCHEMA",required = true)]
8184
schema: PathBuf,
8285
/// Sets input file
83-
#[arg(short, long, value_name = "FILE")]
84-
file: Option<PathBuf>,
85-
#[arg(short, long, value_name = "n-rows", default_value = "100")]
86-
n_rows: Option<i64>,
86+
#[arg(short='o', long="output-file", value_name = "OUTPUT-FILE",required = false)]
87+
output_file: Option<PathBuf>,
88+
#[arg(short='n', long="n-rows", value_name = "NUM-ROWS", default_value = "100",required = false)]
89+
n_rows: Option<usize>,
90+
91+
/// Set the size of the buffer (number of rows).
92+
#[arg(
93+
long = "buffer-size",
94+
value_name = "BUFFER-SIZE",
95+
action = ArgAction::Set,
96+
required = false,
97+
)]
98+
buffer_size: Option<usize>,
99+
100+
/// Set the capacity of the thread channel (number of messages).
101+
#[arg(
102+
long = "thread-channel-capacity",
103+
value_name = "THREAD-CHANNEL-CAPACITY",
104+
action = ArgAction::Set,
105+
required = false,
106+
)]
107+
thread_channel_capacity: Option<usize>,
108+
87109
},
88110
Convert {
89111
/// Sets schema file
@@ -119,7 +141,7 @@ impl Cli {
119141
pub fn run<'a>(
120142
&self,
121143
in_buffers: &mut [ChunkAndResidue; IN_MAX_CHUNKS],
122-
) -> Result<(), error::ExecutionError> {
144+
) -> Result<()> {
123145
let n_logical_threads = num_cpus::get();
124146
let mut n_threads: usize = self.n_threads as usize;
125147

@@ -139,17 +161,23 @@ impl Cli {
139161
match &self.command {
140162
Some(Commands::Mock {
141163
schema,
142-
file,
164+
output_file,
143165
n_rows,
144-
}) => {
145-
print!("target file {:?}", file.as_ref());
146-
147-
mocker::Mocker::new(
148-
schema::FixedSchema::from_path(schema.into()),
149-
file.clone(),
150-
n_threads,
151-
)
152-
.generate(n_rows.unwrap() as usize);
166+
buffer_size,
167+
thread_channel_capacity,
168+
}) => {
169+
print!("target file {:?}", output_file.as_ref());
170+
171+
Mocker::builder()
172+
.schema(schema.to_owned())
173+
.output_file(output_file.to_owned())
174+
.num_rows(*n_rows)
175+
.num_threads(n_threads)
176+
.buffer_size(*buffer_size)
177+
.thread_channel_capacity(*thread_channel_capacity)
178+
.build()?
179+
.generate();
180+
153181
Ok(())
154182
}
155183

src/error.rs

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,56 @@
2222
* SOFTWARE.
2323
*
2424
* File created: 2024-02-05
25-
* Last updated: 2024-02-05
25+
* Last updated: 2024-05-07
2626
*/
2727

28+
use std::error;
29+
use std::fmt;
30+
use std::result;
31+
32+
/// Generic result type which allows for dynamic dispatch of our custom error variants.
33+
pub(crate) type Result<T> = result::Result<T, Box<dyn error::Error>>;
34+
35+
/// Error type used during execution when something goes wrong.
36+
#[derive(Debug)]
37+
pub(crate) struct ExecutionError;
38+
39+
impl error::Error for ExecutionError {}
40+
impl fmt::Display for ExecutionError {
41+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42+
write!(f, "Execution failed, please refer to any logged error messages or the stack-trace for debugging.")
43+
}
44+
}
45+
46+
/// Error type used during setup of either [`Mocker`] or [`Converter`] if any
47+
/// required values are invalid or missing.
2848
#[derive(Debug)]
29-
pub struct ExecutionError {}
49+
pub(crate) struct SetupError;
50+
51+
impl error::Error for SetupError {}
52+
impl fmt::Display for SetupError {
53+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54+
write!(f, "Setup failed, please refer to any logged error messages or the stack-trace for debugging.")
55+
}
56+
}
57+
58+
#[cfg(test)]
59+
mod tests_error {
60+
use super::*;
61+
62+
#[test]
63+
fn test_execution_error() {
64+
assert_eq!(
65+
"Execution failed, please refer to any logged error messages or the stack-trace for debugging.",
66+
ExecutionError.to_string(),
67+
);
68+
}
69+
70+
#[test]
71+
fn test_setup_error() {
72+
assert_eq!(
73+
"Setup failed, please refer to any logged error messages or the stack-trace for debugging.",
74+
SetupError.to_string(),
75+
);
76+
}
77+
}

src/main.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,15 @@ mod schema;
3636
use crate::slicers::old_slicer::{IN_MAX_CHUNKS, SLICER_IN_CHUNK_SIZE};
3737
use crate::slicers::ChunkAndResidue;
3838
use cli::Cli;
39+
mod threads;
40+
mod writer;
41+
mod builder;
42+
mod parser;
3943

4044
mod converters;
4145
mod dump;
4246
mod slicers;
47+
mod mocking;
4348
///
4449
fn main() {
4550
let cli = Cli::parse();

0 commit comments

Comments
 (0)