@@ -4,6 +4,8 @@ use crate::grammar::Row;
44use crate :: u8x16:: u8x16;
55use std:: ops:: Range ;
66
7+ /// [`CsvReader`] holds 3 bits per byte of the data set.
8+ /// To parse CSV, you only need to know whether each byte is a quotation mark, a comma, a newline delimiter, or something else.
79#[ derive( Debug ) ]
810pub struct CsvReader {
911 quotation_bitsets : Vec < u64 > ,
@@ -13,8 +15,10 @@ pub struct CsvReader {
1315
1416impl CsvReader {
1517 pub fn new ( data : & [ u8 ] ) -> Self {
18+ // TODO: can a CSV file contain non-UTF-8 encoded bytes, and if so, how should they be handled?
19+
1620 let vectors = CsvClassifier :: new ( data) . classify ( ) ;
17- let capacity = vectors. len ( ) / 4 + 1 ;
21+ let capacity = vectors. len ( ) / 4 + ( vectors . len ( ) % 4 != 0 ) as usize ;
1822
1923 let comma_broadcast = u8x16:: broadcast ( COMMA_CLASS ) ;
2024 let new_line_broadcast = u8x16:: broadcast ( NEW_LINE_CLASS ) ;
@@ -251,34 +255,34 @@ mod tests {
251255 ) ;
252256 }
253257
254- #[ test]
255- fn read_taxi_zone_lookup ( ) {
256- let data = r#"
257- "LocationID","Borough","Zone","service_zone"
258- 1,"EWR","Newark Airport","EWR"
259- 2,"Queens","Jamaica Bay","Boro Zone"
260- 3,"Bronx","Allerton/Pelham Gardens","Boro Zone""# ;
261-
262- let mut csv = Vec :: new ( ) ;
263-
264- for row in CsvReader :: new ( data. as_bytes ( ) ) . read ( ) {
265- let fields = row
266- . fields ( )
267- . iter ( )
268- . map ( |field_range| {
269- String :: from_utf8 ( data[ field_range. clone ( ) ] . as_bytes ( ) . to_vec ( ) ) . unwrap ( )
270- } )
271- . collect :: < Vec < _ > > ( ) ;
272-
273- csv. push ( fields) ;
274- }
275-
276- println ! ( "Statistics\n total rows: {}" , csv. len( ) ) ;
277-
278- for ( i, row) in csv. iter ( ) . enumerate ( ) {
279- println ! ( "row {}\t {}\n " , i, row. join( "\t " ) ) ;
280- }
281- }
258+ // #[test]
259+ // fn read_taxi_zone_lookup() {
260+ // let data = r#"
261+ // "LocationID","Borough","Zone","service_zone"
262+ // 1,"EWR","Newark Airport","EWR"
263+ // 2,"Queens","Jamaica Bay","Boro Zone"
264+ // 3,"Bronx","Allerton/Pelham Gardens","Boro Zone""#;
265+
266+ // let mut csv = Vec::new();
267+
268+ // for row in CsvReader::new(data.as_bytes()).read() {
269+ // let fields = row
270+ // .fields()
271+ // .iter()
272+ // .map(|field_range| {
273+ // String::from_utf8(data[field_range.clone()].as_bytes().to_vec()).unwrap()
274+ // })
275+ // .collect::<Vec<_>>();
276+
277+ // csv.push(fields);
278+ // }
279+
280+ // println!("Statistics\ntotal rows: {}", csv.len());
281+
282+ // for (i, row) in csv.iter().enumerate() {
283+ // println!("row {}\t{}\n", i, row.join("\t"));
284+ // }
285+ // }
282286
283287 #[ test]
284288 fn test_mark_inside_quotations ( ) {
0 commit comments