Skip to content

Commit c878aed

Browse files
authored
[chore] merge line-based slicer to master (#34)
[chore] merge line-based slicer to master
2 parents 7b81d27 + b6c8d7e commit c878aed

47 files changed

Lines changed: 2120 additions & 62138 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.cargo/config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
[build]
2-
rustflags = ["-C", "target-cpu=native"]
2+
rustflags = [ "-C", "target-cpu=native" ]

.gitattributes

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
- name: Install cargo-llvm-cov
2828
uses: taiki-e/install-action@cargo-llvm-cov
2929
- name: Generate coverage report
30-
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
30+
run: cargo llvm-cov --features rayon --workspace --lcov --output-path lcov.info
3131
- name: Upload coverage report
3232
uses: codecov/codecov-action@v4
3333
with:

.gitignore

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ debug/
44
target/
55

66
# ignore fixed length mock files
7-
.flf
7+
*.flf
8+
9+
# ignore parquet files
10+
*.parquet
811

912
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
1013
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
@@ -16,4 +19,8 @@ Cargo.lock
1619
# MSVC Windows builds of rustc generate these, which store debugging information
1720
*.pdb
1821

22+
# RustRover IDE files
23+
.idea/
24+
25+
# Python virtualenv for testing parquet files etc.
1926
.python-venv/

Cargo.toml

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,54 @@
11
[package]
22
name = "evolution"
3-
version = "0.3.5"
3+
version = "1.0.0"
44
edition = "2021"
5-
description = "🦖 Evolve your fixed length data files into Apache Arrow tables, fully parallelized!"
5+
description = "🦖 Evolve your fixed-length data files into Apache Arrow tables, fully parallelized!"
66
authors = [
7-
"Wilhelm Ågren <wilhelmagren98@gmail.com>",
87
"Ted Hammarlund <TedHammarlund@gmail.com>",
98
"Rickard Lundin <rickard@x14.se>",
9+
"Wilhelm Ågren <wilhelmagren98@gmail.com>",
1010
]
1111

1212
readme = "README.md"
1313
license-file = "LICENSE"
1414
homepage = "https://github.com/firelink-data/evolution"
1515
repository = "https://github.com/firelink-data/evolution"
1616

17-
keywords = [ "cli", "arrow", "data-engineering",
18-
"apache-parquet", "apache-arrow" ]
17+
keywords = [
18+
"cli",
19+
"arrow",
20+
"data-engineering",
21+
"apache-parquet",
22+
"apache-arrow",
23+
]
1924
include = [ "**/*.rs", "Cargo.toml", "LICENSE", "README.md" ]
2025
default-run = "evolution"
21-
# This is nom nom nom for SIMD
22-
2326

2427
[dependencies]
25-
arrow2 = { version = "0.18.0", features = ["io_ipc"] }
26-
arrow = { version = "51.0.0", features = ["ipc"] }
27-
debug_print = "1.0.0"
28-
arrow-format = "0.8.1"
29-
arrow-schema = "51.0.0"
30-
arrow-array = "51.0.0"
31-
parquet = "51.0.0"
32-
atoi_simd = "0.15.6"
33-
chrono = "0.4.38"
34-
clap = { version = "4.5.4", features = ["default", "derive"] }
35-
crossbeam = "0.8.4"
36-
colored = "2.1.0"
37-
env_logger = "0.11.3"
38-
half = "2.4.1"
39-
log = "0.4.21"
28+
chrono = "0.4.31"
29+
crossbeam = "0.8.2"
30+
colored = "2.0.4"
31+
env_logger = "0.11.1"
32+
half = "2.3.1"
33+
log = "0.4.20"
4034
num_cpus = "1.16.0"
41-
rand = { version = "0.8.5" }
42-
rayon = { version = "1.10.0" }
43-
serde = { version = "1.0.201", features = ["derive"] }
44-
serde_json = "1.0.117"
35+
rand = "0.8.5"
36+
serde_json = "1.0.108"
4537
threadpool = "1.8.1"
46-
substring = "1.4.5"
47-
tempfile = "3.10.1"
38+
clap = { version = "4.4.8", features = ["default", "derive"] }
39+
serde = { version = "1.0.193", features = ["derive"] }
40+
rayon = { version = "1.8.1", optional = true }
41+
atoi_simd = { version = "0.15.6", optional = true }
42+
arrow2 = "0.18.0"
4843
libc = "0.2.154"
44+
arrow = "51.0.0"
45+
parquet = "51.0.0"
4946
padder = { version = "1.2.0", features = ["serde"] }
5047

5148
[dev-dependencies]
5249
glob = "0.3.1"
5350

51+
[features]
52+
default = []
53+
rayon = [ "dep:rayon", "dep:atoi_simd" ]
54+
nightly = []

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2023 Firelink Data
3+
Copyright (c) 2024 Firelink Data
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

Makefile

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
.DEFAULT_GOAL := debug
2+
3+
all: clean debug
4+
5+
.PHONY: clean
6+
clean:
7+
cargo clean
8+
9+
.PHONY: debug
10+
debug:
11+
cargo build
12+
13+
.PHONY: release
14+
release:
15+
cargo build --release
16+
17+
.PHONY: test
18+
test:
19+
cargo test
20+
21+
.PHONY: check
22+
check:
23+
cargo check
24+
25+
.PHONY: format
26+
format:
27+
cargo fmt --all
28+
29+
.PHONY: lint
30+
lint:
31+
cargo clippy --all-features
32+

README.md

Lines changed: 122 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,28 @@
1414

1515
</div>
1616

17+
1718
## 🔎 Overview
1819

19-
...
20+
**Take your old and highly inefficient fixed-length files and evolve them into something more efficient, like Apache Parquet!**
21+
22+
This repository hosts the **evolution** program which both allows you to convert existing fixed-length files into other data formats,
23+
but also allows you to create large amounts of mocked data blazingly fast. The program supports full parallelism and utilizes SIMD
24+
techniques, when possible, for highly efficient parsing of data. To get started, follow the installation, setup, and example usage
25+
sections below in this README.
26+
27+
Happy hacking! 👋🥳
28+
29+
30+
## 📋 Table of contents
31+
https://github.com/firelink-data/evolution/edit/feat/single-threaded/README.md
32+
* [Installation](https://github.com/firelink-data/evolution#-installation)
33+
* [Schema setup](https://github.com/firelink-data/evolution#-schema-setup)
34+
* [Example usage](https://github.com/firelink-data/evolution#-example-usage)
35+
* [Converting]()
36+
* [Mocking]()
37+
* [Threading]()
38+
* [License](https://github.com/firelink-data/evolution#-license)
2039

2140

2241
## 📦 Installation
@@ -38,6 +57,63 @@ standard library threads and has so far proven a more reliable version, the alte
3857
for parallel iteration. To use **rayon** instead, build or install the program with the `--features rayon` flag.
3958

4059

60+
## 📝 Schema setup
61+
62+
All available commands in *evolution* require an existing valid **schema**. A schema, in this context, is a [json](https://www.json.org/json-en.html)
63+
file specifying the layout of the contents of a fixed-length file. Every schema used has to follow
64+
[this](https://github.com/firelink-data/evolution/tree/main/resources/template-schema.json) template. If you are unsure whether or not your own schema
65+
file is valid according to the template, you can use [this](https://www.jsonschemavalidator.net/) validator tool.
66+
67+
An example schema can be found [here](https://github.com/firelink-data/evolution/tree/main/resources/example-schema.json), and looks like this:
68+
```
69+
{
70+
"name": "EvolutionExampleSchema",
71+
"version": 1337,
72+
"columns": [
73+
{
74+
"name": "id",
75+
"offset": 0,
76+
"length": 9,
77+
"dtype": "i32",
78+
"alignment": "Right",
79+
"pad_symbol": "Zero",
80+
"is_nullable": false
81+
},
82+
{
83+
"name": "name",
84+
"offset": 9,
85+
"length": 32,
86+
"dtype": "utf8",
87+
"is_nullable": true
88+
},
89+
{
90+
"name": "city",
91+
"offset": 41,
92+
"length": 32,
93+
"dtype": "utf8",
94+
"alignment": "Right",
95+
"pad_symbol": "Backslash",
96+
"is_nullable": false
97+
},
98+
{
99+
"name": "employed",
100+
"offset": 73,
101+
"length": 5,
102+
"dtype": "boolean",
103+
"alignment": "Center",
104+
"pad_symbol": "Asterisk",
105+
"is_nullable": false
106+
}
107+
]
108+
}
109+
```
110+
111+
As specified in the template, all columns have to provide the following fields **(name, offset, length, is_nullable)**, whereas
112+
**alignment** and **pad_symbol** can be omitted (as they are in this example for the *name* column). If they are not provided, they will assume their default values which are
113+
"**Right**" and "**Whitespace**" respectively. These default values come from the [padder](https://github.com/firelink-data/padder) crate which defines the enums
114+
`Alignment` and `Symbol`, with default implementations as `Alignment::Right` and `Symbol::Whitespace` respectively.
115+
116+
41117
## 🚀 Example usage
42118

43119
If you build and/or install the program as explained above then by simply running the binary you will see the following:
@@ -57,63 +133,66 @@ Options:
57133
-V, --version Print version
58134
```
59135

60-
The functionality of the program is structured as two main commands: **mock** and **convert**.
136+
As you can see from above, the functionality of the program comprises the two main commands: **convert** and **mock**.
61137

62-
### 👨‍🎨 Mocking
138+
139+
### 🏗️👷‍♂️ Converting
63140

64141
```
65-
Generate mocked fixed-length files (.flf) for testing purposes
142+
Convert a fixed-length file (.flf) to parquet
66143
67-
Usage: evolution mock [OPTIONS] --schema <SCHEMA>
144+
Usage: evolution convert [OPTIONS] --file <FILE> --schema <SCHEMA>
68145
69146
Options:
70-
-s, --schema <SCHEMA>
71-
Specify the .json schema file to mock data for
147+
-f, --file <FILE>
148+
The fixed-length file to convert
72149
-o, --output-file <OUTPUT-FILE>
73150
Specify output (target) file name
74-
-n, --n-rows <NUM-ROWS>
75-
Set the number of rows to generate [default: 100]
151+
-s, --schema <SCHEMA>
152+
Specify the .json schema file to use when converting
76153
--buffer-size <BUFFER-SIZE>
77-
Set the size of the buffer (number of rows)
154+
Set the size of the buffer (in bytes)
78155
--thread-channel-capacity <THREAD-CHANNEL-CAPACITY>
79156
Set the capacity of the thread channel (number of messages)
80157
-h, --help
81158
Print help
82159
```
83160

84-
For example, if you wanted to mock 1 billion rows of a fixed-length file from a schema located at `./my/path/to/schema.json` with
85-
the output name `mocked-data.flf`, you could run the following command:
161+
To convert a fixed-length file called `really-big-data.flf`, with associated schema located at `./my/path/to/schema.json`, to a parquet file with name `smaller-data.parquet`, you could run the following command:
86162
```
87-
evolution mock --schema ./my/schema/path/schema.json --output-file mocked-data.flf --n-rows 1000000000
163+
evolution convert --file really-big-data.flf --output-file smaller-data.parquet --schema ./my/path/to/schema.json
88164
```
89165

90-
### 🏗️👷‍♂️ Converting
166+
167+
### 👨‍🎨 Mocking
91168

92169
```
93-
Convert a fixed-length file (.flf) to parquet
170+
Generate mocked fixed-length files (.flf) for testing purposes
94171
95-
Usage: evolution convert [OPTIONS] --file <FILE> --schema <SCHEMA>
172+
Usage: evolution mock [OPTIONS] --schema <SCHEMA>
96173
97174
Options:
98-
-f, --file <FILE>
99-
The fixed-length file to convert
175+
-s, --schema <SCHEMA>
176+
Specify the .json schema file to mock data for
100177
-o, --output-file <OUTPUT-FILE>
101178
Specify output (target) file name
102-
-s, --schema <SCHEMA>
103-
Specify the .json schema file to use when converting
179+
-n, --n-rows <NUM-ROWS>
180+
Set the number of rows to generate [default: 100]
104181
--buffer-size <BUFFER-SIZE>
105-
Set the size of the buffer (in bytes)
182+
Set the size of the buffer (number of rows)
106183
--thread-channel-capacity <THREAD-CHANNEL-CAPACITY>
107184
Set the capacity of the thread channel (number of messages)
108185
-h, --help
109186
Print help
110187
```
111188

112-
To convert a fixed-length file called `really-big-data.flf`, with associated schema located at `./my/path/to/schema.json`, to a parquet file with name `smaller-data.parquet`, you could run the following command:
189+
For example, if you wanted to mock 1 billion rows of a fixed-length file from a schema located at `./my/path/to/schema.json` with
190+
the output name `mocked-data.flf`, you could run the following command:
113191
```
114-
evolution convert --file really-big-data.flf --output-file smaller-data.parquet --schema ./my/path/to/schema.json
192+
evolution mock --schema ./my/path/to/schema.json --output-file mocked-data.flf --n-rows 1000000000
115193
```
116194

195+
117196
### 🧵 Threading
118197

119198
There exists a global setting for the program called `--n-threads` which dictates whether or not the invoked command will be executed
@@ -123,12 +202,27 @@ logical cores**. If this argument is omitted, then the program will run in singl
123202

124203
**Note that running multithreaded only really has any clear increase in performance for substantially large workloads.**
125204

126-
### 🧵 Converting multithreaded
127-
An experimental multithreaded implementation exists , it reads chunks of 2 megabytes and splits them into n anmounts of cores in O(1).
128-
Run a small conversion test using the "arrow" converter with slicer type "chunked"
205+
If you are unsure how many logical cores your CPU has, the easiest way to find out is by simply running the program with the
206+
`--n-threads` option set to a large number. The program will check how many logical cores you have and see whether
207+
this option exceeds the possible value. If the value you passed is greater than the number of logical cores on your system, then
208+
the number of logical cores available will be logged to you on stdout.
209+
210+
You could also potentially use one of the commands below depending on your host system.
211+
212+
### Windows
213+
```
214+
Get-WmiObject Win32_Processor | Select-Object Name, NumberOfCores, NumberOfLogicalProcessors
129215
```
130-
$ cargo run --package evolution --release --bin evolution -- c-convert --schema resources/schema/test_schema.json --in-file resources/test-flf/test_schema_mock.flf --out-file out.parquet arrow chunks
216+
217+
Use the value found under **NumberOfLogicalProcessors**.
218+
219+
### Unix
131220
```
221+
lscpu | grep -E '^Thread|^Core|^Socket|^CPU\('
222+
```
223+
224+
The number of logical cores is calculated as: **threads per core X cores per socket X sockets**.
225+
132226

133-
## 📋 License
227+
## 📜 License
134228
All code is to be held under a general MIT license, please see [LICENSE](https://github.com/firelink-data/evolution/blob/main/LICENSE) for specific information.

0 commit comments

Comments
 (0)