Skip to content

Commit c878aed

Browse files
authored
[chore] merge line-based slicer to master (#34)
[chore] merge line-based slicer to master
2 parents 7b81d27 + b6c8d7e commit c878aed

47 files changed

Lines changed: 2120 additions & 62138 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.cargo/config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
[build]
2-
rustflags = ["-C", "target-cpu=native"]
2+
rustflags = [ "-C", "target-cpu=native" ]

.gitattributes

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ jobs:
2727
- name: Install cargo-llvm-cov
2828
uses: taiki-e/install-action@cargo-llvm-cov
2929
- name: Generate coverage report
30-
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
30+
run: cargo llvm-cov --features rayon --workspace --lcov --output-path lcov.info
3131
- name: Upload coverage report
3232
uses: codecov/codecov-action@v4
3333
with:

.gitignore

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ debug/
44
target/
55

66
# ignore fixed length mock files
7-
.flf
7+
*.flf
8+
9+
# ignore parquet files
10+
*.parquet
811

912
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
1013
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
@@ -16,4 +19,8 @@ Cargo.lock
1619
# MSVC Windows builds of rustc generate these, which store debugging information
1720
*.pdb
1821

22+
# RustRover IDE files
23+
.idea/
24+
25+
# Python virtualenv for testing parquet files etc.
1926
.python-venv/

Cargo.toml

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,54 @@
11
[package]
22
name = "evolution"
3-
version = "0.3.5"
3+
version = "1.0.0"
44
edition = "2021"
5-
description = "🦖 Evolve your fixed length data files into Apache Arrow tables, fully parallelized!"
5+
description = "🦖 Evolve your fixed-length data files into Apache Arrow tables, fully parallelized!"
66
authors = [
7-
"Wilhelm Ågren <wilhelmagren98@gmail.com>",
87
"Ted Hammarlund <TedHammarlund@gmail.com>",
98
"Rickard Lundin <rickard@x14.se>",
9+
"Wilhelm Ågren <wilhelmagren98@gmail.com>",
1010
]
1111

1212
readme = "README.md"
1313
license-file = "LICENSE"
1414
homepage = "https://github.com/firelink-data/evolution"
1515
repository = "https://github.com/firelink-data/evolution"
1616

17-
keywords = [ "cli", "arrow", "data-engineering",
18-
"apache-parquet", "apache-arrow" ]
17+
keywords = [
18+
"cli",
19+
"arrow",
20+
"data-engineering",
21+
"apache-parquet",
22+
"apache-arrow",
23+
]
1924
include = [ "**/*.rs", "Cargo.toml", "LICENSE", "README.md" ]
2025
default-run = "evolution"
21-
# This is nom nom nom for SIMD
22-
2326

2427
[dependencies]
25-
arrow2 = { version = "0.18.0", features = ["io_ipc"] }
26-
arrow = { version = "51.0.0", features = ["ipc"] }
27-
debug_print = "1.0.0"
28-
arrow-format = "0.8.1"
29-
arrow-schema = "51.0.0"
30-
arrow-array = "51.0.0"
31-
parquet = "51.0.0"
32-
atoi_simd = "0.15.6"
33-
chrono = "0.4.38"
34-
clap = { version = "4.5.4", features = ["default", "derive"] }
35-
crossbeam = "0.8.4"
36-
colored = "2.1.0"
37-
env_logger = "0.11.3"
38-
half = "2.4.1"
39-
log = "0.4.21"
28+
chrono = "0.4.31"
29+
crossbeam = "0.8.2"
30+
colored = "2.0.4"
31+
env_logger = "0.11.1"
32+
half = "2.3.1"
33+
log = "0.4.20"
4034
num_cpus = "1.16.0"
41-
rand = { version = "0.8.5" }
42-
rayon = { version = "1.10.0" }
43-
serde = { version = "1.0.201", features = ["derive"] }
44-
serde_json = "1.0.117"
35+
rand = "0.8.5"
36+
serde_json = "1.0.108"
4537
threadpool = "1.8.1"
46-
substring = "1.4.5"
47-
tempfile = "3.10.1"
38+
clap = { version = "4.4.8", features = ["default", "derive"] }
39+
serde = { version = "1.0.193", features = ["derive"] }
40+
rayon = { version = "1.8.1", optional = true }
41+
atoi_simd = { version = "0.15.6", optional = true }
42+
arrow2 = "0.18.0"
4843
libc = "0.2.154"
44+
arrow = "51.0.0"
45+
parquet = "51.0.0"
4946
padder = { version = "1.2.0", features = ["serde"] }
5047

5148
[dev-dependencies]
5249
glob = "0.3.1"
5350

51+
[features]
52+
default = []
53+
rayon = [ "dep:rayon", "dep:atoi_simd" ]
54+
nightly = []

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2023 Firelink Data
3+
Copyright (c) 2024 Firelink Data
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

Makefile

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
.DEFAULT_GOAL := debug
2+
3+
all: clean debug
4+
5+
.PHONY: clean
6+
clean:
7+
cargo clean
8+
9+
.PHONY: debug
10+
debug:
11+
cargo build
12+
13+
.PHONY: release
14+
release:
15+
cargo build --release
16+
17+
.PHONY: test
18+
test:
19+
cargo test
20+
21+
.PHONY: check
22+
check:
23+
cargo check
24+
25+
.PHONY: format
26+
format:
27+
cargo fmt --all
28+
29+
.PHONY: lint
30+
lint:
31+
cargo clippy --all-features
32+

README.md

Lines changed: 122 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,28 @@
1414

1515
</div>
1616

17+
1718
## 🔎 Overview
1819

19-
...
20+
**Take your old and highly inefficient fixed-length files and evolve them into something more efficient, like Apache Parquet!**
21+
22+
This repository hosts the **evolution** program which both allows you to convert existing fixed-length files into other data formats,
23+
but also allows you to create large amounts of mocked data blazingly fast. The program supports full parallelism and utilizes SIMD
24+
techniques, when possible, for highly efficient parsing of data. To get started, follow the installation, setup, and example usage
25+
sections below in this README.
26+
27+
Happy hacking! 👋🥳
28+
29+
30+
## 📋 Table of contents
31+
https://github.com/firelink-data/evolution/edit/feat/single-threaded/README.md
32+
* [Installation](https://github.com/firelink-data/evolution#-installation)
33+
* [Schema setup](https://github.com/firelink-data/evolution#-schema-setup)
34+
* [Example usage](https://github.com/firelink-data/evolution#-example-usage)
35+
* [Converting]()
36+
* [Mocking]()
37+
* [Threading]()
38+
* [License](https://github.com/firelink-data/evolution#-license)
2039

2140

2241
## 📦 Installation
@@ -38,6 +57,63 @@ standard library threads and has so far proven a more reliable version, the alte
3857
for parallel iteration. To use **rayon** instead, build or install the program with the `--features rayon` flag.
3958

4059

60+
## 📝 Schema setup
61+
62+
All available commands in *evolution* require an existing valid **schema**. A schema, in this context, is a [json](https://www.json.org/json-en.html)
63+
file specifying the layout of the contents of a fixed-length file. Every schema used has to follow
64+
[this](https://github.com/firelink-data/evolution/tree/main/resources/template-schema.json) template. If you are unsure whether or not your own schema
65+
file is valid according to the template, you can use [this](https://www.jsonschemavalidator.net/) validator tool.
66+
67+
An example schema can be found [here](https://github.com/firelink-data/evolution/tree/main/resources/example-schema.json), and looks like this:
68+
```
69+
{
70+
"name": "EvolutionExampleSchema",
71+
"version": 1337,
72+
"columns": [
73+
{
74+
"name": "id",
75+
"offset": 0,
76+
"length": 9,
77+
"dtype": "i32",
78+
"alignment": "Right",
79+
"pad_symbol": "Zero",
80+
"is_nullable": false
81+
},
82+
{
83+
"name": "name",
84+
"offset": 9,
85+
"length": 32,
86+
"dtype": "utf8",
87+
"is_nullable": true
88+
},
89+
{
90+
"name": "city",
91+
"offset": 41,
92+
"length": 32,
93+
"dtype": "utf8",
94+
"alignment": "Right",
95+
"pad_symbol": "Backslash",
96+
"is_nullable": false
97+
},
98+
{
99+
"name": "employed",
100+
"offset": 73,
101+
"length": 5,
102+
"dtype": "boolean",
103+
"alignment": "Center",
104+
"pad_symbol": "Asterisk",
105+
"is_nullable": false
106+
}
107+
]
108+
}
109+
```
110+
111+
As specified in the template, all columns have to provide the following fields **(name, offset, length, is_nullable)**, whereas
112+
**alignment** and **pad_symbol** can be omitted (as they are in this example for the *name* column). If they are not provided, they will assume their default values which are
113+
"**Right**" and "**Whitespace**" respectively. These default values come from the [padder](https://github.com/firelink-data/padder) crate which defines the enums
114+
`Alignment` and `Symbol`, with default implementations as `Alignment::Right` and `Symbol::Whitespace` respectively.
115+
116+
41117
## 🚀 Example usage
42118

43119
If you build and/or install the program as explained above then by simply running the binary you will see the following:
@@ -57,63 +133,66 @@ Options:
57133
-V, --version Print version
58134
```
59135

60-
The functionality of the program is structured as two main commands: **mock** and **convert**.
136+
As you can see from above, the functionality of the program comprises the two main commands: **convert** and **mock**.
61137

62-
### 👨‍🎨 Mocking
138+
139+
### 🏗️👷‍♂️ Converting
63140

64141
```
65-
Generate mocked fixed-length files (.flf) for testing purposes
142+
Convert a fixed-length file (.flf) to parquet
66143
67-
Usage: evolution mock [OPTIONS] --schema <SCHEMA>
144+
Usage: evolution convert [OPTIONS] --file <FILE> --schema <SCHEMA>
68145
69146
Options:
70-
-s, --schema <SCHEMA>
71-
Specify the .json schema file to mock data for
147+
-f, --file <FILE>
148+
The fixed-length file to convert
72149
-o, --output-file <OUTPUT-FILE>
73150
Specify output (target) file name
74-
-n, --n-rows <NUM-ROWS>
75-
Set the number of rows to generate [default: 100]
151+
-s, --schema <SCHEMA>
152+
Specify the .json schema file to use when converting
76153
--buffer-size <BUFFER-SIZE>
77-
Set the size of the buffer (number of rows)
154+
Set the size of the buffer (in bytes)
78155
--thread-channel-capacity <THREAD-CHANNEL-CAPACITY>
79156
Set the capacity of the thread channel (number of messages)
80157
-h, --help
81158
Print help
82159
```
83160

84-
For example, if you wanted to mock 1 billion rows of a fixed-length file from a schema located at `./my/path/to/schema.json` with
85-
the output name `mocked-data.flf`, you could run the following command:
161+
To convert a fixed-length file called `really-big-data.flf`, with associated schema located at `./my/path/to/schema.json`, to a parquet file with name `smaller-data.parquet`, you could run the following command:
86162
```
87-
evolution mock --schema ./my/schema/path/schema.json --output-file mocked-data.flf --n-rows 1000000000
163+
evolution convert --file really-big-data.flf --output-file smaller-data.parquet --schema ./my/path/to/schema.json
88164
```
89165

90-
### 🏗️👷‍♂️ Converting
166+
167+
### 👨‍🎨 Mocking
91168

92169
```
93-
Convert a fixed-length file (.flf) to parquet
170+
Generate mocked fixed-length files (.flf) for testing purposes
94171
95-
Usage: evolution convert [OPTIONS] --file <FILE> --schema <SCHEMA>
172+
Usage: evolution mock [OPTIONS] --schema <SCHEMA>
96173
97174
Options:
98-
-f, --file <FILE>
99-
The fixed-length file to convert
175+
-s, --schema <SCHEMA>
176+
Specify the .json schema file to mock data for
100177
-o, --output-file <OUTPUT-FILE>
101178
Specify output (target) file name
102-
-s, --schema <SCHEMA>
103-
Specify the .json schema file to use when converting
179+
-n, --n-rows <NUM-ROWS>
180+
Set the number of rows to generate [default: 100]
104181
--buffer-size <BUFFER-SIZE>
105-
Set the size of the buffer (in bytes)
182+
Set the size of the buffer (number of rows)
106183
--thread-channel-capacity <THREAD-CHANNEL-CAPACITY>
107184
Set the capacity of the thread channel (number of messages)
108185
-h, --help
109186
Print help
110187
```
111188

112-
To convert a fixed-length file called `really-big-data.flf`, with associated schema located at `./my/path/to/schema.json`, to a parquet file with name `smaller-data.parquet`, you could run the following command:
189+
For example, if you wanted to mock 1 billion rows of a fixed-length file from a schema located at `./my/path/to/schema.json` with
190+
the output name `mocked-data.flf`, you could run the following command:
113191
```
114-
evolution convert --file really-big-data.flf --output-file smaller-data.parquet --schema ./my/path/to/schema.json
192+
evolution mock --schema ./my/path/to/schema.json --output-file mocked-data.flf --n-rows 1000000000
115193
```
116194

195+
117196
### 🧵 Threading
118197

119198
There exists a global setting for the program called `--n-threads` which dictates whether or not the invoked command will be executed
@@ -123,12 +202,27 @@ logical cores**. If this argument is omitted, then the program will run in singl
123202

124203
**Note that running multithreaded only really has any clear increase in performance for substantially large workloads.**
125204

126-
### 🧵 Converting multithreaded
127-
An experimental multithreaded implementation exists , it reads chunks of 2 megabytes and splits them into n anmounts of cores in O(1).
128-
Run a small conversion test using the "arrow" converter with slicer type "chunked"
205+
If you are unsure how many logical cores your CPU has, the easiest way to find out is by simply running the program with the
206+
`--n-threads` option set to a large number. The program will check how many logical cores you have and see whether
207+
this option exceeds the possible value. If the value you passed is greater than the number of logical cores on your system, then
208+
the number of logical cores available will be logged to you on stdout.
209+
210+
You could also potentially use one of the commands below depending on your host system.
211+
212+
### Windows
213+
```
214+
Get-WmiObject Win32_Processor | Select-Object Name, NumberOfCores, NumberOfLogicalProcessors
129215
```
130-
$ cargo run --package evolution --release --bin evolution -- c-convert --schema resources/schema/test_schema.json --in-file resources/test-flf/test_schema_mock.flf --out-file out.parquet arrow chunks
216+
217+
Use the value found under **NumberOfLogicalProcessors**.
218+
219+
### Unix
131220
```
221+
lscpu | grep -E '^Thread|^Core|^Socket|^CPU\('
222+
```
223+
224+
The number of logical cores is calculated as: **threads per core X cores per socket X sockets**.
225+
132226

133-
## 📋 License
227+
## 📜 License
134228
All code is to be held under a general MIT license, please see [LICENSE](https://github.com/firelink-data/evolution/blob/main/LICENSE) for specific information.

0 commit comments

Comments
 (0)