Skip to content

Commit be65fac

Browse files
Update
1 parent f9ed5b5 commit be65fac

File tree

3 files changed

+44
-31
lines changed

3 files changed

+44
-31
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
[package]
2-
name = "bitcsv"
2+
name = "simdcsv"
3+
authors = ["Matthew Kim"]
34
version = "0.1.0"
45
edition = "2024"
6+
description = "A CSV parser"
7+
license = "MIT"
58

69
[dependencies]
710
anyhow = "1.0.98"
11+
12+
13+
[profile.release]
14+
panic = "abort"
15+
lto = true
16+
codegen-units = 1

src/reader.rs

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ use crate::grammar::Row;
44
use crate::u8x16::u8x16;
55
use std::ops::Range;
66

7+
/// [`CsvReader`] holds 3 bits per byte of the input data.
8+
/// To parse CSV, you only need to know whether each byte is a quotation mark, a comma, a newline, or something else.
79
#[derive(Debug)]
810
pub struct CsvReader {
911
quotation_bitsets: Vec<u64>,
@@ -13,8 +15,10 @@ pub struct CsvReader {
1315

1416
impl CsvReader {
1517
pub fn new(data: &[u8]) -> Self {
18+
// TODO: can CSV data contain non-UTF-8 encoded characters?
19+
1620
let vectors = CsvClassifier::new(data).classify();
17-
let capacity = vectors.len() / 4 + 1;
21+
let capacity = vectors.len() / 4 + (vectors.len() % 4 != 0) as usize;
1822

1923
let comma_broadcast = u8x16::broadcast(COMMA_CLASS);
2024
let new_line_broadcast = u8x16::broadcast(NEW_LINE_CLASS);
@@ -251,34 +255,34 @@ mod tests {
251255
);
252256
}
253257

254-
#[test]
255-
fn read_taxi_zone_lookup() {
256-
let data = r#"
257-
"LocationID","Borough","Zone","service_zone"
258-
1,"EWR","Newark Airport","EWR"
259-
2,"Queens","Jamaica Bay","Boro Zone"
260-
3,"Bronx","Allerton/Pelham Gardens","Boro Zone""#;
261-
262-
let mut csv = Vec::new();
263-
264-
for row in CsvReader::new(data.as_bytes()).read() {
265-
let fields = row
266-
.fields()
267-
.iter()
268-
.map(|field_range| {
269-
String::from_utf8(data[field_range.clone()].as_bytes().to_vec()).unwrap()
270-
})
271-
.collect::<Vec<_>>();
272-
273-
csv.push(fields);
274-
}
275-
276-
println!("Statistics\ntotal rows: {}", csv.len());
277-
278-
for (i, row) in csv.iter().enumerate() {
279-
println!("row {}\t{}\n", i, row.join("\t"));
280-
}
281-
}
258+
// #[test]
259+
// fn read_taxi_zone_lookup() {
260+
// let data = r#"
261+
// "LocationID","Borough","Zone","service_zone"
262+
// 1,"EWR","Newark Airport","EWR"
263+
// 2,"Queens","Jamaica Bay","Boro Zone"
264+
// 3,"Bronx","Allerton/Pelham Gardens","Boro Zone""#;
265+
266+
// let mut csv = Vec::new();
267+
268+
// for row in CsvReader::new(data.as_bytes()).read() {
269+
// let fields = row
270+
// .fields()
271+
// .iter()
272+
// .map(|field_range| {
273+
// String::from_utf8(data[field_range.clone()].as_bytes().to_vec()).unwrap()
274+
// })
275+
// .collect::<Vec<_>>();
276+
277+
// csv.push(fields);
278+
// }
279+
280+
// println!("Statistics\ntotal rows: {}", csv.len());
281+
282+
// for (i, row) in csv.iter().enumerate() {
283+
// println!("row {}\t{}\n", i, row.join("\t"));
284+
// }
285+
// }
282286

283287
#[test]
284288
fn test_mark_inside_quotations() {

0 commit comments

Comments
 (0)