Skip to content

Commit 4e4d2c9

Browse files
authored
feat: modular restructure of project as a framework (#71)
Rewrote the entire project structure and broke the functionality out into separate crates, so that the core functionality lives in small crates which people can use to build whatever CLI/tool they want. This way we are not dependent on a single CLI implementation: you can pick and choose which file types you want to convert and which output targets you want. The crates can be seen as the SDK, whereas the binary is just an example of a CLI that uses the SDK.
2 parents e936b89 + 82319fb commit 4e4d2c9

86 files changed

Lines changed: 4933 additions & 5310 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
- name: Install cargo-llvm-cov
2828
uses: taiki-e/install-action@cargo-llvm-cov
2929
- name: Generate coverage report
30-
run: cargo llvm-cov --features rayon --workspace --lcov --output-path lcov.info
30+
run: cargo llvm-cov --features mock --workspace --lcov --output-path lcov.info
3131
- name: Upload coverage report
3232
uses: codecov/codecov-action@v4
3333
with:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@ Cargo.lock
2424

2525
# Python virtualenv for testing parquet files etc.
2626
.python-venv/
27+
.venv/

Cargo.toml

Lines changed: 30 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,41 @@
1-
[package]
2-
name = "evolution"
3-
version = "1.1.0"
1+
[workspace]
2+
members = [
3+
"examples/*",
4+
"crates/*",
5+
]
6+
resolver = "2"
7+
8+
[workspace.package]
49
edition = "2021"
5-
description = "🦖 Evolve your fixed-length data files into Apache Parquet, fully parallelized!"
10+
homepage = "https://github.com/firelink-data/evolution"
11+
repository = "https://github.com/firelink-data/evolution"
612
authors = [
713
"Ted Hammarlund <TedHammarlund@gmail.com>",
814
"Rickard Lundin <rickard@x14.se>",
915
"Wilhelm Ågren <wilhelmagren98@gmail.com>",
1016
]
11-
17+
license = "MIT"
18+
version = "1.2.0"
1219
readme = "README.md"
13-
license-file = "LICENSE"
14-
homepage = "https://github.com/firelink-data/evolution"
15-
repository = "https://github.com/firelink-data/evolution"
16-
20+
categories = [
21+
"science",
22+
"database",
23+
"concurrency",
24+
"command-line-interface",
25+
]
1726
keywords = [
18-
"cli",
1927
"arrow",
20-
"data-engineering",
21-
"apache-parquet",
22-
"apache-arrow",
28+
"parquet",
2329
]
24-
include = [ "**/*.rs", "Cargo.toml", "LICENSE", "README.md" ]
25-
default-run = "evolution"
26-
27-
[dependencies]
28-
chrono = "0.4.38"
29-
crossbeam = "0.8.4"
30-
colored = "2.0.4"
31-
env_logger = "0.11.3"
32-
half = "2.3.1"
33-
log = "0.4.20"
34-
num_cpus = "1.16.0"
35-
rand = "0.8.5"
36-
serde_json = "1.0.117"
37-
threadpool = "1.8.1"
38-
clap = { version = "4.5.4", features = ["default", "derive"] }
39-
serde = { version = "1.0.202", features = ["derive"] }
40-
rayon = { version = "1.10.0", optional = true }
41-
atoi_simd = { version = "0.16.0", optional = true }
42-
arrow2 = "0.18.0"
43-
libc = "0.2.154"
44-
arrow = "51.0.0"
45-
parquet = "51.0.0"
46-
arrow-ipc = "51.0.0"
47-
deltalake = { version="0.17.3", features = ["datafusion"]}
48-
datafusion = "38.0.0"
49-
50-
deltalake-core = "0.17.3"
51-
52-
padder = { version = "1.2.0", features = ["serde"] }
53-
ordered-channel="1.1.0"
54-
atomic-counter = "1.0.1"
55-
crossbeam-channel = "0.5.13"
56-
tracing = "0.1.40"
57-
tokio = "1.37.0"
58-
futures = "0.3.30"
59-
[dev-dependencies]
60-
glob = "0.3.1"
30+
description = "Efficiently evolve your old fixed-length data files into modern file formats. "
6131

62-
[features]
63-
default = [ "rayon" ]
64-
rayon = [ "dep:rayon", "dep:atoi_simd" ]
65-
nightly = []
32+
[workspace.dependencies]
33+
evolution-builder = { path = "crates/evolution-builder", version = "1.2.0" }
34+
evolution-common = { path = "crates/evolution-common", version = "1.2.0" }
35+
evolution-converter = { path = "crates/evolution-converter", version = "1.2.0" }
36+
evolution-mocker = { path = "crates/evolution-mocker", version = "1.2.0" }
37+
evolution-parser = { path = "crates/evolution-parser", version = "1.2.0" }
38+
evolution-schema = { path = "crates/evolution-schema", version = "1.2.0" }
39+
evolution-slicer = { path = "crates/evolution-slicer", version = "1.2.0" }
40+
evolution-target = { path = "crates/evolution-target", version = "1.2.0" }
41+
evolution-writer = { path = "crates/evolution-writer", version = "1.2.0" }

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2024 Firelink Data
3+
Copyright (c) 2023-2024 Firelink Data
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 52 additions & 195 deletions
Large diffs are not rendered by default.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[package]
2+
name = "evolution-builder"
3+
description = "Builder implementations for evolution."
4+
version = { workspace = true }
5+
edition = { workspace = true }
6+
authors = { workspace = true }
7+
homepage = { workspace = true }
8+
repository = { workspace = true }
9+
license = { workspace = true }
10+
readme = { workspace = true }
11+
12+
[lib]
13+
name = "evolution_builder"
14+
path = "src/lib.rs"
15+
bench = false
16+
17+
[dependencies]
18+
evolution-common = { workspace = true }
19+
evolution-parser = { workspace = true }
20+
arrow = { version = "51.0.0", default-features = false }
21+
log = "0.4.21"
22+
half = "2.4.1"
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
//
2+
// MIT License
3+
//
4+
// Copyright (c) 2023-2024 Firelink Data
5+
//
6+
// Permission is hereby granted, free of charge, to any person obtaining a copy
7+
// of this software and associated documentation files (the "Software"), to deal
8+
// in the Software without restriction, including without limitation the rights
9+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
// copies of the Software, and to permit persons to whom the Software is
11+
// furnished to do so, subject to the following conditions:
12+
//
13+
// The above copyright notice and this permission notice shall be included in all
14+
// copies or substantial portions of the Software.
15+
//
16+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
// SOFTWARE.
23+
//
24+
// File created: 2024-05-07
25+
// Last updated: 2024-10-11
26+
//
27+
28+
use arrow::array::ArrayRef;
29+
use evolution_common::error::Result;
30+
use evolution_common::NUM_BYTES_FOR_NEWLINE;
31+
32+
/// Marker trait (it declares no methods) for builders that can be created from a `Vec<ColumnBuilderRef>`.
33+
pub trait Builder: From<Vec<ColumnBuilderRef>> {}
34+
35+
/// Boxed trait-object alias for any [`Builder`] implementation.
/// NOTE(review): `From` requires `Self: Sized`, so `dyn Builder` may not be usable as a trait object — confirm.
36+
pub type BuilderRef = Box<dyn Builder>;
37+
38+
/// Builder for a single column; implementors must be `Send + Sync`.
///
/// * `try_build_column` receives a byte slice and returns a `usize` on success. The caller
///   in this file adds that value to its buffer offset, i.e. treats it as the number of
///   bytes consumed.
/// * `finish` returns a `&str` together with an [`ArrayRef`] — presumably the column name
///   and the finished Arrow array; confirm against the implementors.
39+
pub trait ColumnBuilder: Send + Sync {
40+
fn try_build_column(&mut self, bytes: &[u8]) -> Result<usize>;
41+
fn finish(&mut self) -> (&str, ArrayRef);
42+
}
43+
44+
/// Boxed trait-object alias for any [`ColumnBuilder`] implementation.
45+
pub type ColumnBuilderRef = Box<dyn ColumnBuilder>;
46+
47+
/// Builder that owns an ordered list of column builders and feeds them from a byte buffer
/// (see `try_build_from_slice`).
/// NOTE(review): nothing in this file is Parquet-specific; presumably the Parquet output
/// is produced elsewhere — confirm.
48+
pub struct ParquetBuilder {
49+
columns: Vec<ColumnBuilderRef>,
50+
}
51+
52+
impl ParquetBuilder {
53+
/// Feeds `buffer` through the column builders until the offset reaches the end of the buffer.
///
/// On every pass each column builder, in `Vec` order, is handed the bytes from the current
/// offset to the end of `buffer`, and the offset is advanced by the count it returns. After
/// the last column the offset skips `NUM_BYTES_FOR_NEWLINE` more bytes.
///
/// # Errors
/// Returns the first error produced by a column builder.
///
/// # Panics
/// NOTE(review): `&buffer[idx..]` panics when `idx > buffer.len()`, which can happen if a
/// column builder reports more bytes than it was given — confirm builders never over-report.
54+
pub fn try_build_from_slice(&mut self, buffer: &[u8]) -> Result<()> {
55+
let mut idx: usize = 0;
56+
while idx < buffer.len() {
57+
for column in self.columns.iter_mut() {
58+
idx += column.try_build_column(&buffer[idx..])?;
59+
}
60+
idx += NUM_BYTES_FOR_NEWLINE;
61+
}
62+
63+
Ok(())
64+
}
65+
66+
/// Returns a mutable reference to the underlying vector of column builders.
67+
pub fn columns(&mut self) -> &mut Vec<ColumnBuilderRef> {
68+
&mut self.columns
69+
}
70+
}
71+
72+
/// Wraps the given column builders in a `ParquetBuilder`, taking ownership of the vector as-is.
impl From<Vec<ColumnBuilderRef>> for ParquetBuilder {
73+
fn from(columns: Vec<ColumnBuilderRef>) -> Self {
74+
Self { columns }
75+
}
76+
}
77+
78+
/// Marks `ParquetBuilder` as a [`Builder`]; the trait declares no methods, so the impl body is empty.
impl Builder for ParquetBuilder {}

0 commit comments

Comments
 (0)