Skip to content

Commit 2aef4a5

Browse files
More work
1 parent 572c7b0 commit 2aef4a5

File tree

3 files changed

+59
-33
lines changed

3 files changed

+59
-33
lines changed

src/classifier.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,13 @@ impl<'a> CsvClassifier<'a> {
8484

8585
let mut temp = [0u8; 16];
8686
temp[..slice.len()].copy_from_slice(slice);
87-
87+
8888
let last = self.data[self.data.len() - 1];
89-
89+
9090
if last != 0x0A && last != 0x0D {
91-
temp[slice.len()] = 0x0A;
91+
temp[slice.len()] = 0x0A;
9292
}
93-
93+
9494
(u8x16::from_slice_unchecked(&temp), false)
9595
}
9696
}

src/main.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@ fn main() -> anyhow::Result<()> {
55
let mut args = std::env::args().skip(1);
66

77
let data = args.next().ok_or_else(|| anyhow!("No argument passed"))?;
8-
9-
8+
109
let mut reader = CsvReader::new(data.as_bytes());
1110
let rows = reader.read()?;
12-
11+
1312
println!("{:?}", rows);
1413

1514
Ok(())
16-
}
15+
}

src/reader.rs

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ impl CsvReader {
4545

4646
let mut current_row = Vec::new();
4747

48-
let cursor = 0;
48+
let mut cursor = 0;
4949

5050
for i in 0..self.quotation_bitsets.len() {
51-
let valid_quotations = remove_escaped_quotations(self.quotation_bitsets[i]);
51+
let valid_quotations = remove_escaped_quotations(self.quotation_bitsets[i]);
5252
let outside_quotations = !mark_inside_quotations(valid_quotations);
5353

5454
let mut valid_commas = self.comma_bitsets[i] & outside_quotations;
@@ -70,12 +70,10 @@ impl CsvReader {
7070
break;
7171
}
7272

73-
current_row.push(
74-
Range {
75-
start: cursor + bitset_cursor,
76-
end: cursor + bitset_cursor + bits_traveled
77-
}
78-
);
73+
current_row.push(Range {
74+
start: cursor + bitset_cursor,
75+
end: cursor + bitset_cursor + bits_traveled,
76+
});
7977

8078
if first_whitespace < first_comma {
8179
rows.push(Row::from(current_row.clone()));
@@ -86,15 +84,17 @@ impl CsvReader {
8684
valid_commas <<= bits_traveled + 1;
8785
valid_whitespace <<= bits_traveled + 1;
8886
}
87+
88+
cursor += bitset_cursor;
8989
}
90-
90+
9191
Ok(rows)
9292
}
9393
}
9494

9595
fn remove_escaped_quotations(q: u64) -> u64 {
9696
let escaped = q & (q << 1);
97-
let escaped = escaped | (escaped>> 1);
97+
let escaped = escaped | (escaped >> 1);
9898

9999
q & !escaped
100100
}
@@ -116,21 +116,50 @@ fn mark_inside_quotations(mut x: u64) -> u64 {
116116

117117
// todo, find a quicker way to do this
118118
#[inline]
119-
fn build_u64(chunk: &[u8x16], broadcast: u8x16) -> u64 {
119+
fn build_u64(chunks: &[u8x16], broadcast: u8x16) -> u64 {
120120
let mut packed: u64 = 0;
121-
for (i, &c) in chunk.iter().enumerate() {
122-
let word = c.eq(broadcast).bitset() as u64;
121+
for (i, &chunk) in chunks.iter().enumerate() {
122+
let word = chunk.eq(broadcast).bitset() as u64;
123123
packed |= word << (48 - i * 16);
124124
}
125-
126125
packed
127126
}
128127

129-
130128
#[cfg(test)]
131129
mod tests {
132130
use super::*;
133131

132+
#[test]
133+
fn test_init() -> Result<()> {
134+
let data = std::fs::read("taxi_zone_lookup.csv")?;
135+
136+
// todo fix alignment
137+
// let csv_reader1 = CsvReader::new(&data[..64]);
138+
139+
let slice63 = &data[..63];
140+
141+
let csv_reader1 = CsvReader::new(&slice63);
142+
143+
assert_eq!(csv_reader1.quotation_bitsets.len(), 1);
144+
assert_eq!(csv_reader1.whitespace_bitsets.len(), 1);
145+
assert_eq!(csv_reader1.comma_bitsets.len(), 1);
146+
147+
let mut longer_slice = vec![];
148+
longer_slice.extend_from_slice(slice63);
149+
longer_slice.push(0x0A);
150+
longer_slice.extend_from_slice(&data[63..87]);
151+
152+
let csv_reader2 = CsvReader::new(&longer_slice);
153+
assert_eq!(csv_reader2.quotation_bitsets.len(), 2);
154+
assert_eq!(csv_reader2.whitespace_bitsets.len(), 2);
155+
assert_eq!(csv_reader2.comma_bitsets.len(), 2);
156+
157+
assert_eq!(csv_reader1.quotation_bitsets[0], csv_reader2.quotation_bitsets[0]);
158+
assert_eq!(csv_reader1.whitespace_bitsets[0], csv_reader2.whitespace_bitsets[0]);
159+
assert_eq!(csv_reader1.comma_bitsets[0], csv_reader2.comma_bitsets[0]);
160+
161+
Ok(())
162+
}
134163
fn check_line(test: &[u8], expected: Vec<Vec<String>>) -> Result<()> {
135164
let mut reader = CsvReader::new(test);
136165
let rows = reader
@@ -232,7 +261,7 @@ mod tests {
232261
#[test]
233262
fn read_taxi_zone_lookup_header() -> Result<()> {
234263
let data = b"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\n";
235-
264+
236265
check_line(
237266
data,
238267
vec![vec![
@@ -242,31 +271,29 @@ mod tests {
242271
r#""service_zone""#.to_string(),
243272
]],
244273
)?;
245-
274+
246275
Ok(())
247276
}
248277

249278
// #[test]
250279
// fn read_taxi_zone_lookup() -> Result<()> {
251280
// let data = std::fs::read("taxi_zone_lookup.csv")?;
252-
// assert_eq!(
253-
// b"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r\n",
254-
// &data[..46]
255-
// );
256-
//
281+
//
282+
// // dbg!(String::from_utf8(data[..66].to_vec()).unwrap());
283+
//
257284
// let rows = CsvReader::new(&data).read()?;
258285
// dbg!(&rows);
259-
//
286+
//
260287
// for row in CsvReader::new(&data).read()? {
261288
// let fields = row
262289
// .fields()
263290
// .into_iter()
264291
// .map(|field_range| String::from_utf8(data[field_range.clone()].to_vec()).unwrap())
265292
// .collect::<Vec<_>>();
266-
//
293+
//
267294
// dbg!(fields);
268295
// }
269-
//
296+
//
270297
// Ok(())
271298
// }
272299

0 commit comments

Comments
 (0)