Skip to content

Commit 572c7b0

Browse files
Init commit
0 parents  commit 572c7b0

File tree

12 files changed

+934
-0
lines changed

12 files changed

+934
-0
lines changed

.github/workflows/tests.yml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
name: CI
2+
on:
3+
push:
4+
branches:
5+
- main
6+
pull_request:
7+
8+
jobs:
9+
tests:
10+
runs-on: ubuntu-latest-ARM64
11+
steps:
12+
- name: Checkout
13+
uses: actions/checkout@v4
14+
15+
- uses: dtolnay/rust-toolchain@master
16+
with:
17+
toolchain: "nightly"
18+
19+
- name: version info
20+
run: rustc --version; cargo --version;
21+
22+
- name: Run all tests
23+
run: cargo test
24+
25+
fmt:
26+
runs-on: ubuntu-latest-ARM64
27+
steps:
28+
- name: Checkout
29+
uses: actions/checkout@v4
30+
31+
- uses: dtolnay/rust-toolchain@master
32+
with:
33+
toolchain: nightly
34+
35+
- run: |
36+
rustup component add rustfmt
37+
cargo fmt --all -- --check
38+
39+
clippy:
40+
runs-on: ubuntu-latest-ARM64
41+
steps:
42+
- name: Checkout
43+
uses: actions/checkout@v4
44+
45+
- uses: dtolnay/rust-toolchain@master
46+
with:
47+
toolchain: "nightly"
48+
49+
- run: |
50+
rustup component add clippy
51+
cargo clippy -- -D warnings
52+

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/target

Cargo.lock

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[package]
2+
name = "bitcsv"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
anyhow = "1.0.98"

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# csv
2+
3+
There are many kinds of CSV files; this project supports the format described
4+
in [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180.html).
5+
6+
## Reading
7+
8+
https://www.rfc-editor.org/rfc/rfc4180.html<br>
9+
https://arxiv.org/pdf/1902.08318<br>
10+
https://branchfree.org/2019/03/06/code-fragment-finding-quote-pairs-with-carry-less-multiply-pclmulqdq/<br>

src/classifier.rs

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
// Byte classes produced by the nibble lookup tables below.
//
// A byte `b` is classified as `HI_LOOKUP[b >> 4] & LO_LOOKUP[b & 0x0F]`.
// Because the two table entries are combined with a bitwise AND, every class
// MUST own a distinct bit. If one class value is the union of two others
// (as `3 == 1 | 2` would be), unrelated bytes that merely share a nibble with
// a structural character leak through the AND: with a quotation class of 3,
// `*` (0x2A) and `-` (0x2D) were reported as line terminators.

/// Class bit for the field separator `,` (0x2C).
pub const COMMA_CLASS: u8 = 1;
/// Class bit for the line terminators `\n` (0x0A) and `\r` (0x0D).
pub const WHITESPACE_CLASS: u8 = 2;

/// Class bit for the quotation mark `"` (0x22).
///
/// Deliberately a power of two so it shares no bit with the other classes.
pub const QUOTATION_CLASS: u8 = 4;

/// Classes that are possible for each LOW nibble (`byte & 0x0F`).
pub const LO_LOOKUP: [u8; 16] = [
    0,
    0,
    QUOTATION_CLASS, // 0x_2: '"' is 0x22
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    WHITESPACE_CLASS, // 0x_A: '\n' is 0x0A
    0,
    COMMA_CLASS,      // 0x_C: ',' is 0x2C
    WHITESPACE_CLASS, // 0x_D: '\r' is 0x0D
    0,
    0,
];

/// Classes that are possible for each HIGH nibble (`byte >> 4`).
pub const HI_LOOKUP: [u8; 16] = [
    WHITESPACE_CLASS, // 0x0_: '\n' and '\r'
    0,
    COMMA_CLASS | QUOTATION_CLASS, // 0x2_: ',' and '"'
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
];
42+
43+
use crate::u8x16::u8x16;
44+
45+
#[derive(Debug)]
46+
pub(crate) struct CsvClassifier<'a> {
47+
cursor: usize,
48+
data: &'a [u8],
49+
}
50+
51+
impl<'a> CsvClassifier<'a> {
52+
pub fn new(data: &'a [u8]) -> Self {
53+
Self { cursor: 0, data }
54+
}
55+
56+
pub fn classify(&mut self) -> Vec<u8x16> {
57+
let mut bitsets = Vec::new();
58+
59+
let high_nibble_lookup = u8x16::from_slice_unchecked(&HI_LOOKUP);
60+
let low_nibble_lookup = u8x16::from_slice_unchecked(&LO_LOOKUP);
61+
62+
while self.cursor < self.data.len() {
63+
let (lanes, _aligned) = self.load_u8x16();
64+
65+
let (hi_lanes, lo_lanes) = lanes.nibbles();
66+
let res = high_nibble_lookup.classify(hi_lanes) & low_nibble_lookup.classify(lo_lanes);
67+
68+
bitsets.push(res);
69+
}
70+
71+
bitsets
72+
}
73+
74+
fn load_u8x16(&mut self) -> (u8x16, bool) {
75+
if self.cursor + u8x16::LANE_COUNT < self.data.len() {
76+
let slice = &self.data[self.cursor..self.cursor + u8x16::LANE_COUNT];
77+
self.cursor += u8x16::LANE_COUNT;
78+
79+
return (u8x16::from_slice_unchecked(slice), true);
80+
}
81+
82+
let slice = &self.data[self.cursor..];
83+
self.cursor = self.data.len();
84+
85+
let mut temp = [0u8; 16];
86+
temp[..slice.len()].copy_from_slice(slice);
87+
88+
let last = self.data[self.data.len() - 1];
89+
90+
if last != 0x0A && last != 0x0D {
91+
temp[slice.len()] = 0x0A;
92+
}
93+
94+
(u8x16::from_slice_unchecked(&temp), false)
95+
}
96+
}
97+
98+
#[cfg(test)]
mod tests {
    use super::*;

    /// A short input fits in one block; every structural byte must be tagged
    /// with its class and everything else must stay zero.
    #[test]
    fn test_basic_classify() {
        let bitsets = CsvClassifier::new(b"a,b,c\nf,\"g\"").classify();
        assert_eq!(bitsets.len(), 1);

        // (lane index, expected class); lanes not listed stay 0.
        let tagged = [
            (1, COMMA_CLASS),
            (3, COMMA_CLASS),
            (5, WHITESPACE_CLASS),
            (7, COMMA_CLASS),
            (8, QUOTATION_CLASS),
            (10, QUOTATION_CLASS),
            (11, WHITESPACE_CLASS), // we always add a \n at the end
        ];
        let mut expected = [0u8; 16];
        for (lane, class) in tagged {
            expected[lane] = class;
        }

        let res: [u8; 16] = bitsets[0].into();
        assert_eq!(res, expected);
    }
}

src/grammar.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
use std::ops::Range;
2+
3+
/// One parsed row: an ordered list of index ranges, one per field.
///
/// NOTE(review): the ranges are presumably byte offsets into the buffer the
/// row was parsed from — confirm against the reader.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Row(Vec<Range<usize>>);

impl Row {
    /// Wraps a list of field ranges in a `Row`.
    ///
    /// Kept as an inherent method for existing callers; the `From`
    /// implementation below offers the same conversion through `.into()`.
    pub fn from(fields: Vec<Range<usize>>) -> Self {
        Self(fields)
    }

    /// Returns the field ranges in the order they were supplied.
    pub fn fields(&self) -> &[Range<usize>] {
        &self.0
    }
}

impl From<Vec<Range<usize>>> for Row {
    /// Wraps a list of field ranges in a `Row`.
    fn from(fields: Vec<Range<usize>>) -> Self {
        Self(fields)
    }
}

src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#![warn(clippy::nursery)]
2+
3+
mod classifier;
4+
pub mod grammar;
5+
pub mod reader;
6+
mod u8x16;

src/main.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
use anyhow::anyhow;
2+
use bitcsv::reader::CsvReader;
3+
4+
fn main() -> anyhow::Result<()> {
5+
let mut args = std::env::args().skip(1);
6+
7+
let data = args.next().ok_or_else(|| anyhow!("No argument passed"))?;
8+
9+
10+
let mut reader = CsvReader::new(data.as_bytes());
11+
let rows = reader.read()?;
12+
13+
println!("{:?}", rows);
14+
15+
Ok(())
16+
}

0 commit comments

Comments
 (0)