diff --git a/.gitattributes b/.gitattributes
index a2ad6cb..cd8c663 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -57,4 +57,4 @@
#*.PDF diff=astextplain
#*.rtf diff=astextplain
#*.RTF diff=astextplain
-**/large-csv-files/** filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
diff --git a/Directory.Build.props b/Directory.Build.props
index b6bbf76..b10b8ad 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -8,14 +8,14 @@
true7.3
- true
+ truetrue
-
+
diff --git a/README.md b/README.md
index ba574db..ea255e9 100644
--- a/README.md
+++ b/README.md
@@ -9,43 +9,10 @@ A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library
Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html).
## Usage
-1. Create a subclass of `CsvReaderVisitorBase` with your own logic.
-1. To read a CSV file:
- - Create a new instance of your visitor.
- - Create a new instance of `CsvTokenizer`.
- - Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file.
- - Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file.
-
-## Example
-This demonstrates using Cursively to write the details of a particular UTF-8 encoded file to the console.
+Create a subclass of `CsvReaderVisitorBase` (or one of its built-in subclasses) with your own logic for processing the individual elements in order. Then you have some options.
+### Example Visitor
```csharp
-public static void ProcessCsvFile(string csvFilePath)
-{
- var myVisitor = new MyVisitor(maxFieldLength: 1000);
- var tokenizer = new CsvTokenizer();
- using (var file = File.OpenRead(csvFilePath))
- {
- Console.WriteLine($"Started reading '{csvFilePath}'.");
- Span<byte> fileReadBuffer = new byte[4096];
- while (true)
- {
- int count = file.Read(fileReadBuffer);
- if (count == 0)
- {
- break;
- }
-
- var chunk = fileReadBuffer.Slice(0, count);
- tokenizer.ProcessNextChunk(chunk, myVisitor);
- }
-
- tokenizer.ProcessEndOfStream(myVisitor);
- }
-
- Console.WriteLine($"Finished reading '{csvFilePath}'.");
-}
-
public sealed class MyVisitor : CsvReaderVisitorBase
{
private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder();
@@ -69,7 +36,7 @@ public sealed class MyVisitor : CsvReaderVisitorBase
private void VisitFieldContents(ReadOnlySpan<byte> chunk, bool flush)
{
int charCount = _utf8Decoder.GetCharCount(chunk, flush);
- if (charCount + _bufferConsumed < _buffer.Length)
+ if (charCount + _bufferConsumed <= _buffer.Length)
{
_utf8Decoder.GetChars(chunk, new Span<char>(_buffer, _bufferConsumed, charCount), flush);
_bufferConsumed += charCount;
@@ -79,14 +46,76 @@ public sealed class MyVisitor : CsvReaderVisitorBase
throw new InvalidDataException($"Field is longer than {_buffer.Length} characters.");
}
- if (!flush)
+ if (flush)
{
- return;
+ Console.Write("Field: ");
+ Console.WriteLine(_buffer, 0, _bufferConsumed);
+ _bufferConsumed = 0;
}
+ }
+}
+```
- Console.Write("Field: ");
- Console.WriteLine(_buffer, 0, _bufferConsumed);
- _bufferConsumed = 0;
+### Fastest
+All of the other methods of processing the data are built on top of this, so it gives you the most control:
+1. Create a new instance of your visitor.
+1. Create a new instance of `CsvTokenizer`.
+1. Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file.
+1. Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file.
+
+Example:
+```csharp
+public static void ProcessCsvFile(string csvFilePath)
+{
+ var myVisitor = new MyVisitor(maxFieldLength: 1000);
+ var tokenizer = new CsvTokenizer();
+ using (var file = File.OpenRead(csvFilePath))
+ {
+ Console.WriteLine($"Started reading '{csvFilePath}'.");
+ Span<byte> fileReadBuffer = new byte[4096];
+ while (true)
+ {
+ int count = file.Read(fileReadBuffer);
+ if (count == 0)
+ {
+ break;
+ }
+
+ var chunk = fileReadBuffer.Slice(0, count);
+ tokenizer.ProcessNextChunk(chunk, myVisitor);
+ }
+
+ tokenizer.ProcessEndOfStream(myVisitor);
}
+
+ Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+```
+
+### Simpler
+1. Create a new instance of your visitor.
+1. Call one of the `Csv.Process*` methods, passing in whatever format your data is in along with your visitor.
+
+Examples:
+```csharp
+public static void ProcessCsvFile(string csvFilePath)
+{
+ Console.WriteLine($"Started reading '{csvFilePath}'.");
+ Csv.ProcessFile(csvFilePath, new MyVisitor(maxFieldLength: 1000));
+ Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+
+public static void ProcessCsvStream(Stream csvStream)
+{
+ Console.WriteLine($"Started reading '{csvFilePath}'.");
+ Csv.ProcessStream(csvStream, new MyVisitor(maxFieldLength: 1000));
+ Console.WriteLine($"Finished reading '{csvFilePath}'.");
+}
+
+public static async ValueTask ProcessCsvStreamAsync(Stream csvStream, IProgress<int> progress = null, CancellationToken cancellationToken = default)
+{
+ Console.WriteLine($"Started reading '{csvFilePath}'.");
+ await Csv.ProcessStreamAsync(csvStream, new MyVisitor(maxFieldLength: 1000), progress, cancellationToken);
+ Console.WriteLine($"Finished reading '{csvFilePath}'.");
}
```
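+
+### Example Visitor with Headers
+`CsvReaderVisitorWithUTF8HeadersBase` (one of the built-in subclasses mentioned above) consumes the header record for you and exposes it through its `Headers` property. A minimal sketch of a hypothetical subclass:
+```csharp
+public sealed class MyHeaderAwareVisitor : CsvReaderVisitorWithUTF8HeadersBase
+{
+    // Headers is only safe to read once the header record has been consumed.
+    protected override void VisitEndOfHeaderRecord() =>
+        Console.WriteLine($"Headers: {string.Join(", ", Headers)}");
+
+    protected override void VisitPartialDataFieldContents(ReadOnlySpan<byte> chunk)
+    {
+        // Buffer partial UTF-8 field data here, as MyVisitor does above.
+    }
+
+    protected override void VisitEndOfDataField(ReadOnlySpan<byte> chunk) =>
+        Console.WriteLine($"Got the last chunk of field '{Headers[CurrentFieldIndex]}'.");
+
+    protected override void VisitEndOfDataRecord() =>
+        Console.WriteLine("End of record.");
+}
+```
+Hook it up the same way as any other visitor, e.g., `Csv.ProcessFile(csvFilePath, new MyHeaderAwareVisitor());`.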
diff --git a/doc/benchmark-1.0.0.md b/doc/benchmark-1.0.0.md
new file mode 100644
index 0000000..6ba25b1
--- /dev/null
+++ b/doc/benchmark-1.0.0.md
@@ -0,0 +1,71 @@
+This benchmark tests the simple act of counting how many records are in a CSV file. It's not a simple count of how many lines are in the text file: line breaks within quoted fields must be treated as data, and multiple line breaks in a row must be treated as one, since each record must have at least one field. Therefore, assuming correct implementations, this benchmark should test the raw CSV processing speed.
+
+Cursively eliminates a ton of overhead found in libraries such as CsvHelper by restricting the allowed input encodings and using the visitor pattern as its only means of output. Cursively can scan through the original bytes of the input to do its work, and it can give slices of the input data directly to the consumer without having to copy or allocate.
+
+Therefore, these benchmarks are somewhat biased in favor of Cursively, as CsvHelper relies on external code to transform the data to UTF-16. This isn't as unfair as that makes it sound: the overwhelming majority of input files are probably UTF-8 anyway (or a compatible SBCS), so this transformation is something that practically every user will experience.
+
+- Input files can be found here: https://github.com/airbreather/Cursively/tree/v1.0.0/test/Cursively.Benchmark/large-csv-files
+- Benchmark source code is a slightly edited* version of this: https://github.com/airbreather/Cursively/tree/v1.0.0/test/Cursively.Benchmark
+ - *edited only to remove `CoreRtJob` and the more-or-less redundant `NopUsingCursively`
+
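+For reference, "counting records" with Cursively just means a visitor that ignores the field data and reacts to end-of-record notifications. A minimal sketch of such a visitor (the linked benchmark source above is the authoritative version):
+
+```csharp
+public sealed class RowCountingVisitor : CsvReaderVisitorBase
+{
+    public long RecordCount { get; private set; }
+
+    public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) { }
+
+    public override void VisitEndOfField(ReadOnlySpan<byte> chunk) { }
+
+    public override void VisitEndOfRecord() => RecordCount++;
+}
+```
+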
+Raw BenchmarkDotNet output is at the bottom, but here are some numbers derived from it. The data was fully loaded in main memory when running these tests. This summary also does not indicate anything about the GC pressure:
+
+|CSV File|Runtime|Library|Throughput|
+|-|-|-|-|
+|100 records / 10,000 tiny fields each|.NET 4.7.2|Cursively|99.81 MiB/s|
+|100 records / 10,000 tiny fields each|.NET 4.7.2|CsvHelper|22.60 MiB/s|
+|100 records / 10,000 tiny fields each|.NET Core 2.2.5|Cursively|126.1 MiB/s|
+|100 records / 10,000 tiny fields each|.NET Core 2.2.5|CsvHelper|25.32 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|Cursively|118.5 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|CsvHelper|25.05 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|Cursively|187.0 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|CsvHelper|27.96 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET 4.7.2|Cursively|64.15 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET 4.7.2|CsvHelper|15.57 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|Cursively|112.7 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|CsvHelper|14.84 MiB/s|
+|Mock data from Mockaroo|.NET 4.7.2|Cursively|1.637 GiB/s|
+|Mock data from Mockaroo|.NET 4.7.2|CsvHelper|74.81 MiB/s|
+|Mock data from Mockaroo|.NET Core 2.2.5|Cursively|1.893 GiB/s|
+|Mock data from Mockaroo|.NET Core 2.2.5|CsvHelper|66.86 MiB/s|
+
+Raw BenchmarkDotNet output:
+
+``` ini
+
+BenchmarkDotNet=v0.11.5, OS=Windows 10.0.17134.765 (1803/April2018Update/Redstone4)
+Intel Core i7-6850K CPU 3.60GHz (Skylake), 1 CPU, 12 logical and 6 physical cores
+Frequency=3515622 Hz, Resolution=284.4447 ns, Timer=TSC
+.NET Core SDK=2.2.300
+ [Host] : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT
+ Job-ASLTDW : .NET Framework 4.7.2 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3416.0
+ Job-RICADF : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT
+
+Server=True
+
+```
+| Method | Runtime | csvFile | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
+|------------------------ |-------- |--------------------- |-----------:|----------:|----------:|------:|--------:|-----------:|----------:|---------:|------------:|
+| CountRowsUsingCursively | Clr | 100-huge-records | 27.714 ms | 0.0126 ms | 0.0105 ms | 1.00 | 0.00 | - | - | - | 256 B |
+| CountRowsUsingCsvHelper | Clr | 100-huge-records | 122.397 ms | 0.1685 ms | 0.1494 ms | 4.42 | 0.01 | 17250.0000 | 6250.0000 | 750.0000 | 110257334 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | 100-huge-records | 21.932 ms | 0.0254 ms | 0.0226 ms | 1.00 | 0.00 | - | - | - | 56 B |
+| CountRowsUsingCsvHelper | Core | 100-huge-records | 109.261 ms | 0.3319 ms | 0.3104 ms | 4.98 | 0.02 | 400.0000 | 200.0000 | - | 110256320 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | 100-h(...)uoted [23] | 39.453 ms | 0.0974 ms | 0.0864 ms | 1.00 | 0.00 | - | - | - | 683 B |
+| CountRowsUsingCsvHelper | Clr | 100-h(...)uoted [23] | 186.572 ms | 0.4682 ms | 0.4380 ms | 4.73 | 0.01 | 24666.6667 | 9666.6667 | 666.6667 | 153595995 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | 100-h(...)uoted [23] | 24.995 ms | 0.0160 ms | 0.0142 ms | 1.00 | 0.00 | - | - | - | 56 B |
+| CountRowsUsingCsvHelper | Core | 100-h(...)uoted [23] | 167.160 ms | 0.3437 ms | 0.3215 ms | 6.69 | 0.02 | 333.3333 | - | - | 153579848 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | 10k-empty-records | 148.952 ms | 0.2502 ms | 0.2340 ms | 1.00 | 0.00 | - | - | - | 2048 B |
+| CountRowsUsingCsvHelper | Clr | 10k-empty-records | 613.718 ms | 0.8869 ms | 0.7862 ms | 4.12 | 0.01 | 66000.0000 | 2000.0000 | - | 420838944 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | 10k-empty-records | 84.801 ms | 0.1079 ms | 0.1009 ms | 1.00 | 0.00 | - | - | - | 56 B |
+| CountRowsUsingCsvHelper | Core | 10k-empty-records | 644.051 ms | 2.8782 ms | 2.5515 ms | 7.60 | 0.03 | 2000.0000 | - | - | 420832856 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | mocked | 7.242 ms | 0.0233 ms | 0.0207 ms | 1.00 | 0.00 | - | - | - | 64 B |
+| CountRowsUsingCsvHelper | Clr | mocked | 162.298 ms | 0.2958 ms | 0.2622 ms | 22.41 | 0.08 | 18000.0000 | 333.3333 | - | 115764389 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | mocked | 6.264 ms | 0.0115 ms | 0.0107 ms | 1.00 | 0.00 | - | - | - | 56 B |
+| CountRowsUsingCsvHelper | Core | mocked | 181.592 ms | 0.3413 ms | 0.3193 ms | 28.99 | 0.09 | 333.3333 | - | - | 115757736 B |
diff --git a/doc/benchmark-1.1.0.md b/doc/benchmark-1.1.0.md
new file mode 100644
index 0000000..a588ec9
--- /dev/null
+++ b/doc/benchmark-1.1.0.md
@@ -0,0 +1,79 @@
+This benchmark tests the simple act of counting how many records are in a CSV file. It's not a simple count of how many lines are in the text file: line breaks within quoted fields must be treated as data, and multiple line breaks in a row must be treated as one, since each record must have at least one field. Therefore, assuming correct implementations, this benchmark should test the raw CSV processing speed.
+
+Cursively eliminates a ton of overhead found in libraries such as CsvHelper by restricting the allowed input encodings and using the visitor pattern as its only means of output. Cursively can scan through the original bytes of the input to do its work, and it can give slices of the input data directly to the consumer without having to copy or allocate.
+
+Therefore, these benchmarks are somewhat biased in favor of Cursively, as CsvHelper relies on external code to transform the data to UTF-16. This isn't as unfair as that makes it sound: the overwhelming majority of input files are probably UTF-8 anyway (or a compatible SBCS), so this transformation is something that practically every user will experience.
+
+- Input files can be found here: https://github.com/airbreather/Cursively/tree/v1.1.0/test/Cursively.Benchmark/large-csv-files.zip
+- Benchmark source code is this: https://github.com/airbreather/Cursively/tree/v1.1.0/test/Cursively.Benchmark
+
+Raw BenchmarkDotNet output is at the bottom, but here are some numbers derived from it. The data was fully loaded in main memory when running these tests. This summary also does not indicate anything about the GC pressure:
+
+|CSV File|Runtime|Library|Throughput|
+|-|-|-|-|
+|100 records / 10,000 tiny fields each|.NET 4.7.2|Cursively|336.06 MiB/s|
+|100 records / 10,000 tiny fields each|.NET 4.7.2|CsvHelper|22.04 MiB/s|
+|100 records / 10,000 tiny fields each|.NET Core 2.2.5|Cursively|487.59 MiB/s|
+|100 records / 10,000 tiny fields each|.NET Core 2.2.5|CsvHelper|27.31 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|Cursively|178.23 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET 4.7.2|CsvHelper|24.33 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|Cursively|303.67 MiB/s|
+|100 records / 10,000 tiny quoted fields each|.NET Core 2.2.5|CsvHelper|29.20 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET 4.7.2|Cursively|176.71 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET 4.7.2|CsvHelper|14.45 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|Cursively|306.49 MiB/s|
+|10,000 records / 1,000 empty fields each|.NET Core 2.2.5|CsvHelper|15.15 MiB/s|
+|Mock data from Mockaroo|.NET 4.7.2|Cursively|2,711.41 MiB/s|
+|Mock data from Mockaroo|.NET 4.7.2|CsvHelper|72.50 MiB/s|
+|Mock data from Mockaroo|.NET Core 2.2.5|Cursively|3,755.55 MiB/s|
+|Mock data from Mockaroo|.NET Core 2.2.5|CsvHelper|75.05 MiB/s|
+|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET 4.7.2|Cursively|390.75 MiB/s|
+|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET 4.7.2|CsvHelper|40.15 MiB/s|
+|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET Core 2.2.5|Cursively|607.81 MiB/s|
+|worldcitiespop.csv ([from here](https://burntsushi.net/stuff/))|.NET Core 2.2.5|CsvHelper|39.90 MiB/s|
+
+Raw BenchmarkDotNet output:
+
+``` ini
+
+BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
+Intel Core i7-6850K CPU 3.60GHz (Skylake), 1 CPU, 12 logical and 6 physical cores
+.NET Core SDK=3.0.100-preview6-012264
+ [Host] : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT
+ Job-DDQSKN : .NET Framework 4.7.2 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.8.3801.0
+ Job-RTHUVO : .NET Core 2.2.5 (CoreCLR 4.6.27617.05, CoreFX 4.6.27618.01), 64bit RyuJIT
+
+Server=True
+
+```
+| Method | Runtime | csvFile | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
+|------------------------ |-------- |--------------------- |-------------:|----------:|----------:|------:|--------:|------------:|-----------:|---------:|-------------:|
+| CountRowsUsingCursively | Clr | 100-huge-records | 8.231 ms | 0.0839 ms | 0.0743 ms | 1.00 | 0.00 | - | - | - | 128 B |
+| CountRowsUsingCsvHelper | Clr | 100-huge-records | 125.493 ms | 1.1717 ms | 1.0387 ms | 15.25 | 0.21 | 17250.0000 | 6750.0000 | 750.0000 | 110560856 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | 100-huge-records | 5.673 ms | 0.0073 ms | 0.0068 ms | 1.00 | 0.00 | - | - | - | 48 B |
+| CountRowsUsingCsvHelper | Core | 100-huge-records | 101.277 ms | 0.2342 ms | 0.2190 ms | 17.85 | 0.05 | 400.0000 | 200.0000 | - | 110256320 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | 100-h(...)uoted [23] | 26.222 ms | 0.0260 ms | 0.0231 ms | 1.00 | 0.00 | - | - | - | 256 B |
+| CountRowsUsingCsvHelper | Clr | 100-h(...)uoted [23] | 192.090 ms | 0.9954 ms | 0.9311 ms | 7.33 | 0.04 | 25000.0000 | 11000.0000 | 666.6667 | 154027456 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | 100-h(...)uoted [23] | 15.390 ms | 0.0450 ms | 0.0399 ms | 1.00 | 0.00 | - | - | - | 48 B |
+| CountRowsUsingCsvHelper | Core | 100-h(...)uoted [23] | 160.043 ms | 0.4644 ms | 0.4344 ms | 10.40 | 0.04 | 333.3333 | - | - | 153579848 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | 10k-empty-records | 54.007 ms | 0.3061 ms | 0.2556 ms | 1.00 | 0.00 | - | - | - | 819 B |
+| CountRowsUsingCsvHelper | Clr | 10k-empty-records | 661.502 ms | 3.1801 ms | 2.9747 ms | 12.24 | 0.08 | 66000.0000 | 2000.0000 | - | 422077104 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | 10k-empty-records | 31.178 ms | 0.2056 ms | 0.1924 ms | 1.00 | 0.00 | - | - | - | 48 B |
+| CountRowsUsingCsvHelper | Core | 10k-empty-records | 630.683 ms | 1.2503 ms | 1.1084 ms | 20.23 | 0.13 | 2000.0000 | - | - | 420832856 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | mocked | 4.478 ms | 0.0071 ms | 0.0067 ms | 1.00 | 0.00 | - | - | - | 64 B |
+| CountRowsUsingCsvHelper | Clr | mocked | 167.477 ms | 0.3523 ms | 0.3296 ms | 37.40 | 0.08 | 18333.3333 | 333.3333 | - | 116105312 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | mocked | 3.233 ms | 0.0063 ms | 0.0059 ms | 1.00 | 0.00 | - | - | - | 48 B |
+| CountRowsUsingCsvHelper | Core | mocked | 161.791 ms | 0.3473 ms | 0.3249 ms | 50.05 | 0.15 | 333.3333 | - | - | 115757736 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Clr | worldcitiespop | 369.738 ms | 0.6855 ms | 0.6077 ms | 1.00 | 0.00 | - | - | - | 8192 B |
+| CountRowsUsingCsvHelper | Clr | worldcitiespop | 3,598.421 ms | 2.0735 ms | 1.9396 ms | 9.73 | 0.02 | 493000.0000 | 7000.0000 | - | 3105811440 B |
+| | | | | | | | | | | | |
+| CountRowsUsingCursively | Core | worldcitiespop | 237.695 ms | 0.2994 ms | 0.2800 ms | 1.00 | 0.00 | - | - | - | 48 B |
+| CountRowsUsingCsvHelper | Core | worldcitiespop | 3,620.550 ms | 3.1766 ms | 2.8160 ms | 15.23 | 0.02 | 15000.0000 | - | - | 3096694312 B |
diff --git a/doc/release-notes.md b/doc/release-notes.md
index 8c1dc2d..1096e59 100644
--- a/doc/release-notes.md
+++ b/doc/release-notes.md
@@ -1,4 +1,12 @@
# Cursively Release Notes
+## [1.1.0](https://github.com/airbreather/Cursively/milestone/1)
+- Several further performance optimizations. Most significantly, inlining and tuning a critical `ReadOnlySpan` extension method.
+ - In some cases, this increased throughput by a factor of 3.
+- Added hooks for visitor implementations to detect situations where the stream does not conform to the RFC 4180 rules for quoted fields ([#4](https://github.com/airbreather/Cursively/issues/4))
+- Added support to customize the field delimiter byte ([#11](https://github.com/airbreather/Cursively/issues/11))
+- Added helpers to avoid having to use `CsvTokenizer` directly in most cases ([#9](https://github.com/airbreather/Cursively/issues/9), [#10](https://github.com/airbreather/Cursively/issues/10))
+- Added an intermediate abstract visitor class that handles UTF-8 encoded headers ([#5](https://github.com/airbreather/Cursively/issues/5))
+
## 1.0.0
- Initial release.
diff --git a/doc/toc.yml b/doc/toc.yml
index aa4ba7c..c94d485 100644
--- a/doc/toc.yml
+++ b/doc/toc.yml
@@ -2,6 +2,8 @@
href: index.md
- name: API Documentation
href: obj/api/
+- name: Benchmark
+ href: benchmark-1.1.0.md
- name: Release Notes
href: release-notes.md
- name: NuGet Package
diff --git a/lgtm.yml b/lgtm.yml
new file mode 100644
index 0000000..0c3c07c
--- /dev/null
+++ b/lgtm.yml
@@ -0,0 +1,6 @@
+extraction:
+ csharp:
+ after_prepare:
+ - export LGTM=true
+ index:
+ solution: src/Cursively/Cursively.csproj
diff --git a/src/Cursively/Csv.cs b/src/Cursively/Csv.cs
new file mode 100644
index 0000000..eca7491
--- /dev/null
+++ b/src/Cursively/Csv.cs
@@ -0,0 +1,319 @@
+using System;
+using System.IO;
+using System.IO.MemoryMappedFiles;
+using System.Runtime.CompilerServices;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Cursively
+{
+ ///
+ /// Contains helper methods for CSV processing.
+ ///
+ public static class Csv
+ {
+ ///
+ /// Describes the contents of a CSV stream to the given instance of the
+ /// <see cref="CsvReaderVisitorBase"/> class.
+ ///
+ ///
+ /// The CSV stream to describe.
+ ///
+ ///
+ /// The <see cref="CsvReaderVisitorBase"/> instance to describe the stream to.
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> is <see langword="null"/>.
+ ///
+ public static void ProcessStream(Stream csvStream, CsvReaderVisitorBase visitor) =>
+ ProcessStream(csvStream, visitor, 81920);
+
+ ///
+ /// Describes the contents of a CSV stream to the given instance of the
+ /// <see cref="CsvReaderVisitorBase"/> class.
+ ///
+ ///
+ /// The CSV stream to describe.
+ ///
+ ///
+ /// The <see cref="CsvReaderVisitorBase"/> instance to describe the stream to.
+ ///
+ ///
+ /// The length of the buffer to use (default: 81920).
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> is <see langword="null"/>.
+ ///
+ ///
+ /// Thrown when <paramref name="bufferSize"/> is not greater than zero.
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> does not support reading (i.e.,
+ /// <see cref="Stream.CanRead"/> is <see langword="false"/>).
+ ///
+ public static void ProcessStream(Stream csvStream, CsvReaderVisitorBase visitor, int bufferSize)
+ {
+ if (csvStream is null)
+ {
+ throw new ArgumentNullException(nameof(csvStream));
+ }
+
+ if (bufferSize <= 0)
+ {
+ throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Must be greater than zero.");
+ }
+
+ if (!csvStream.CanRead)
+ {
+ throw new ArgumentException("Stream does not support reading.", nameof(csvStream));
+ }
+
+ byte[] buffer = new byte[bufferSize];
+ var tokenizer = new CsvTokenizer();
+ int cnt;
+ while ((cnt = csvStream.Read(buffer, 0, buffer.Length)) != 0)
+ {
+ tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(buffer, 0, cnt), visitor);
+ }
+
+ tokenizer.ProcessEndOfStream(visitor);
+ }
+
+ ///
+ /// Describes the contents of a CSV stream to the given instance of the
+ /// <see cref="CsvReaderVisitorBase"/> class.
+ ///
+ ///
+ /// The CSV stream to describe.
+ ///
+ ///
+ /// The <see cref="CsvReaderVisitorBase"/> instance to describe the stream to.
+ ///
+ ///
+ ///
+ /// An <see cref="IProgress{T}"/> that will be notified every time the next chunk of the
+ /// stream is processed, with the size of the chunk (in bytes) that was processed.
+ ///
+ ///
+ /// All notifications will receive values less than or equal to the buffer size in bytes
+ /// (which, for this overload, is the default value of 81,920).
+ ///
+ ///
+ /// There will be one last notification with value 0 after the entire stream has been
+ /// processed and the final few stream elements have been consumed.
+ ///
+ ///
+ /// This may be left as <see langword="null"/> if no progress notifications are needed.
+ ///
+ ///
+ ///
+ ///
+ /// An instance of <see cref="CancellationToken"/> that may be used to signal that results
+ /// are no longer needed, and so the method should terminate at its earliest convenience.
+ ///
+ ///
+ /// This may be left as its default value if the
+ /// operation does not need to support cancellation.
+ ///
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> is <see langword="null"/>.
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> does not support reading (i.e.,
+ /// <see cref="Stream.CanRead"/> is <see langword="false"/>).
+ ///
+ ///
+ /// Thrown (perhaps asynchronously) to acknowledge cancellation. A derived exception, such
+ /// as <see cref="TaskCanceledException"/>, may also be thrown by the system.
+ ///
+ ///
+ /// Thrown (perhaps asynchronously) if the underlying
+ /// object backing <paramref name="csvStream"/> is disposed before the asynchronous
+ /// operation terminates.
+ ///
+ public static ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisitorBase visitor, IProgress<int> progress = null, CancellationToken cancellationToken = default) =>
+ ProcessStreamAsync(csvStream, visitor, 81920, progress, cancellationToken);
+
+ ///
+ /// Describes the contents of a CSV stream to the given instance of the
+ /// <see cref="CsvReaderVisitorBase"/> class.
+ ///
+ ///
+ /// The CSV stream to describe.
+ ///
+ ///
+ /// The <see cref="CsvReaderVisitorBase"/> instance to describe the stream to.
+ ///
+ ///
+ /// The length of the buffer to use (default: 81920).
+ ///
+ ///
+ ///
+ /// An <see cref="IProgress{T}"/> that will be notified every time the next chunk of the
+ /// stream is processed, with the size of the chunk (in bytes) that was processed.
+ ///
+ ///
+ /// All notifications will receive values less than or equal to the buffer size in bytes
+ /// (which, for this overload, is the value of <paramref name="bufferSize"/>).
+ ///
+ ///
+ /// There will be one last notification with value 0 after the entire stream has been
+ /// processed and the final few stream elements have been consumed.
+ ///
+ ///
+ /// This may be left as <see langword="null"/> if no progress notifications are needed.
+ ///
+ ///
+ ///
+ ///
+ /// An instance of <see cref="CancellationToken"/> that may be used to signal that results
+ /// are no longer needed, and so the method should terminate at its earliest convenience.
+ ///
+ ///
+ /// This may be left as its default value if the
+ /// operation does not need to support cancellation.
+ ///
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> is <see langword="null"/>.
+ ///
+ ///
+ /// Thrown when <paramref name="bufferSize"/> is not greater than zero.
+ ///
+ ///
+ /// Thrown when <paramref name="csvStream"/> does not support reading (i.e.,
+ /// <see cref="Stream.CanRead"/> is <see langword="false"/>).
+ ///
+ ///
+ /// Thrown (perhaps asynchronously) to acknowledge cancellation. A derived exception, such
+ /// as <see cref="TaskCanceledException"/>, may also be thrown by the system.
+ ///
+ ///
+ /// Thrown (perhaps asynchronously) if the underlying
+ /// object backing <paramref name="csvStream"/> is disposed before the asynchronous
+ /// operation terminates.
+ ///
+ public static async ValueTask ProcessStreamAsync(Stream csvStream, CsvReaderVisitorBase visitor, int bufferSize, IProgress<int> progress = null, CancellationToken cancellationToken = default)
+ {
+ if (csvStream is null)
+ {
+ throw new ArgumentNullException(nameof(csvStream));
+ }
+
+ if (bufferSize <= 0)
+ {
+ throw new ArgumentOutOfRangeException(nameof(bufferSize), bufferSize, "Must be greater than zero.");
+ }
+
+ if (!csvStream.CanRead)
+ {
+ throw new ArgumentException("Stream does not support reading.", nameof(csvStream));
+ }
+
+ byte[] buffer = new byte[bufferSize];
+ var tokenizer = new CsvTokenizer();
+ int cnt;
+ while ((cnt = await csvStream.ReadAsync(buffer, 0, buffer.Length, cancellationToken).ConfigureAwait(false)) != 0)
+ {
+ tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(buffer, 0, cnt), visitor);
+ progress?.Report(cnt);
+
+ // not all streams support cancellation, so we might as well do this ourselves. it
+ // does involve a volatile read, so don't go overboard.
+ cancellationToken.ThrowIfCancellationRequested();
+ }
+
+ tokenizer.ProcessEndOfStream(visitor);
+ progress?.Report(0);
+ }
+
+ ///
+ /// Describes the entire contents of a CSV file to the given instance of the
+ /// <see cref="CsvReaderVisitorBase"/> class.
+ ///
+ ///
+ /// The path to the CSV file to describe.
+ ///
+ ///
+ /// The <see cref="CsvReaderVisitorBase"/> instance to describe the file to.
+ ///
+ ///
+ /// The current version of this method uses memory-mapping behind the scenes in order to
+ /// minimize the overhead of copying and cutting across discrete buffers, at the expense of
+ /// slightly more overhead to set up the memory map than a typical read-from-stream pattern.
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ ///
+ /// See .
+ ///
+ public static unsafe void ProcessFile(string csvFilePath, CsvReaderVisitorBase visitor)
+ {
+ using (var fl = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan))
+ {
+ long length = fl.Length;
+ if (length == 0)
+ {
+ return;
+ }
+
+ var tokenizer = new CsvTokenizer();
+ using (var memoryMappedFile = MemoryMappedFile.CreateFromFile(fl, null, 0, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true))
+ using (var accessor = memoryMappedFile.CreateViewAccessor(0, 0, MemoryMappedFileAccess.Read))
+ {
+ var handle = accessor.SafeMemoryMappedViewHandle;
+ byte* ptr = null;
+ RuntimeHelpers.PrepareConstrainedRegions();
+ try
+ {
+ handle.AcquirePointer(ref ptr);
+ while (length > int.MaxValue)
+ {
+ tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(ptr, int.MaxValue), visitor);
+ length -= int.MaxValue;
+ ptr += int.MaxValue;
+ }
+
+ tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(ptr, unchecked((int)length)), visitor);
+ tokenizer.ProcessEndOfStream(visitor);
+ }
+ finally
+ {
+ if (ptr != null)
+ {
+ handle.ReleasePointer();
+ }
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs
index 5b0eea6..d2a5410 100644
--- a/src/Cursively/CsvReaderVisitorBase.cs
+++ b/src/Cursively/CsvReaderVisitorBase.cs
@@ -29,8 +29,9 @@ public abstract class CsvReaderVisitorBase
/// This method may be called at any time.
///
///
- /// Only <see cref="VisitPartialFieldContents"/> and <see cref="VisitEndOfField"/> may be
- /// called directly after a call to this method.
+ /// Only <see cref="VisitPartialFieldContents"/>, <see cref="VisitEndOfField"/>, and
+ /// <see cref="VisitNonstandardQuotedField"/> may be called directly after a call to this
+ /// method.
///
///
/// There are multiple reasons why this method may be called instead of going straight to <see cref="VisitEndOfField"/>
@@ -70,7 +71,8 @@ public abstract class CsvReaderVisitorBase
/// This method may be called at any time.
///
///
- /// Any method, including this one, may be called directly after a call to this method.
+ /// Any method except <see cref="VisitNonstandardQuotedField"/>, including this one, may be
+ /// called directly after a call to this method.
///
///
/// This method may be called without a preceding <see cref="VisitPartialFieldContents"/>
@@ -94,6 +96,42 @@ public abstract class CsvReaderVisitorBase
///
public abstract void VisitEndOfRecord();
+ ///
+ ///
+ /// Notifies that the current field contains double-quote characters that do not comply with
+ /// RFC 4180, and so it is being processed according to this library's extra rules.
+ ///
+ ///
+ /// The default behavior of this method is to do nothing. Subclasses may wish to override
+ /// to add warnings / errors when processing streams that do not follow RFC 4180 and are
+ /// therefore in danger of being processed differently than other tools.
+ ///
+ ///
+ ///
+ ///
+ /// This method may only be called as the very next method that gets called after a call to
+ /// <see cref="VisitPartialFieldContents"/>, and only at most once per field (i.e., once it
+ /// is called, it may not be called again until a <see cref="VisitEndOfField"/> call brings
+ /// the tokenizer back to a state where RFC 4180 rules are expected).
+ ///
+ ///
+ /// Only <see cref="VisitPartialFieldContents"/> and <see cref="VisitEndOfField"/> may be
+ /// called directly after a call to this method.
+ ///
+ ///
+ /// The last byte in the preceding <see cref="VisitPartialFieldContents"/> call's chunk will
+ /// be the specific byte that was unexpected; all bytes before it were legal under RFC 4180.
+ /// So if this event is being raised because the tokenizer found a double-quote in a field
+ /// that did not start with a double-quote, then <see cref="VisitPartialFieldContents"/> was
+ /// previously called with a chunk that ended with that double-quote. If it's being raised
+ /// because a double-quote was found in a quoted field that was not immediately followed by
+ /// a double-quote, delimiter, or line ending, then <see cref="VisitPartialFieldContents"/>
+ /// was previously called with a chunk that ended with whichever byte immediately followed
+ /// the double-quote that ended the quoted part of the quoted field data.
+ ///
+ ///
+ public virtual void VisitNonstandardQuotedField() { }
+
private sealed class NullVisitor : CsvReaderVisitorBase
{
public override void VisitEndOfRecord() { }
diff --git a/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs b/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs
new file mode 100644
index 0000000..dae9ca4
--- /dev/null
+++ b/src/Cursively/CsvReaderVisitorWithUTF8HeadersBase.cs
@@ -0,0 +1,471 @@
+using System;
+using System.Collections.Immutable;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Text;
+
+namespace Cursively
+{
+ ///
+ ///
+ /// Intermediate base class for CSV reader visitors that don't want to have to implement header
+ /// handling by themselves.
+ ///
+ ///
+ /// Instances of this class are tied to a single CSV stream and cannot be reused or reset for
+ /// use with other CSV streams.
+ ///
+ ///
+ /// Each instance of this visitor has an upper-bound on the maximum number of headers and on the
+ /// maximum length of each header. CSV streams that exceed these limits will cause this class
+ /// to throw exceptions, and behavior of a particular instance is undefined once this happens.
+ ///
+ ///
+ ///
+ ///
+ /// The following input-dependent exceptions may get thrown when using this visitor, all of
+ /// which inherit from <see cref="CursivelyDataStreamException"/>:
+ ///
+ ///
+ ///
+ ///
+ /// <see cref="CursivelyHeadersAreNotUTF8Exception"/>, if <see cref="DefaultDecoderFallback"/> is
+ /// being used and the CSV stream contains a sequence of invalid UTF-8 bytes.
+ ///
+ ///
+ ///
+ ///
+ /// <see cref="CursivelyHeaderIsTooLongException"/>, if the CSV stream contains one or more
+ /// headers that are longer than the configured maximum.
+ ///
+ ///
+ ///
+ ///
+ /// <see cref="CursivelyTooManyHeadersException"/>, if the CSV stream contains more headers than
+ /// the configured maximum.
+ ///
+ ///
+ ///
+ ///
+ /// <see cref="CursivelyExtraDataFieldsException"/>, by default, if a data record contains more
+ /// fields than the header record.
+ ///
+ ///
+ ///
+ ///
+ /// <see cref="CursivelyMissingDataFieldsException"/>, by default, if a data record contains fewer
+ /// fields than the header record.
+ ///
+ ///
+ ///
+ ///
+ public abstract class CsvReaderVisitorWithUTF8HeadersBase : CsvReaderVisitorBase
+ {
+ ///
+ /// The value used by to initialize the
+ /// maximum number of headers (1,000).
+ ///
+ protected static readonly int DefaultMaxHeaderCount = 1_000;
+
+ ///
+ /// The value used by to initialize the
+ /// maximum length, in UTF-16 code units, of a single header (100).
+ ///
+ protected static readonly int DefaultMaxHeaderLength = 100;
+
+ ///
+ /// The value used by <see cref="CsvReaderVisitorWithUTF8HeadersBase()"/> to initialize the
+ /// value indicating whether or not to ignore a leading UTF-8 BOM (true).
+ ///
+ protected static readonly bool DefaultIgnoreUTF8IdentifierOnFirstHeaderField = true;
+
+ ///
+ /// The value used by <see cref="CsvReaderVisitorWithUTF8HeadersBase()"/> to initialize the
+ /// fallback logic when the decoder encounters invalid UTF-8 bytes (throw an exception).
+ ///
+ protected static readonly DecoderFallback DefaultDecoderFallback = new CursivelyDecoderExceptionFallback();
+
+ private static readonly UTF8Encoding EncodingToUse = new UTF8Encoding(false, false);
+
+ private readonly Decoder _headerDecoder;
+
+ private readonly ImmutableArray<string>.Builder _headersBuilder;
+
+ private readonly bool _ignoreUTF8IdentifierOnFirstHeaderField;
+
+ private char[] _headerBuffer;
+
+ private ImmutableArray<string> _headers;
+
+ private int _headerBufferConsumed;
+
+ private int _currentFieldIndex = -1;
+
+ ///
+ /// Initializes a new instance of the <see cref="CsvReaderVisitorWithUTF8HeadersBase"/> class.
+ ///
+ protected CsvReaderVisitorWithUTF8HeadersBase()
+ : this(maxHeaderCount: DefaultMaxHeaderCount,
+ maxHeaderLength: DefaultMaxHeaderLength,
+ ignoreUTF8IdentifierOnFirstHeaderField: DefaultIgnoreUTF8IdentifierOnFirstHeaderField,
+ decoderFallback: DefaultDecoderFallback)
+ {
+ }
+
+ ///
+ /// Initializes a new instance of the <see cref="CsvReaderVisitorWithUTF8HeadersBase"/> class.
+ ///
+ ///
+ /// The maximum number of headers to allow.
+ /// Default: <see cref="DefaultMaxHeaderCount"/>.
+ ///
+ ///
+ /// The maximum length, in UTF-16 code units, of any particular header.
+ /// Default: <see cref="DefaultMaxHeaderLength"/>.
+ ///
+ ///
+ /// A value indicating whether or not to ignore a leading UTF-8 BOM.
+ /// Default: <see cref="DefaultIgnoreUTF8IdentifierOnFirstHeaderField"/>.
+ ///
+ ///
+ /// The fallback logic used when the decoder encounters invalid UTF-8 bytes.
+ /// Default: <see cref="DefaultDecoderFallback"/>.
+ ///
+ ///
+ /// Thrown when <paramref name="decoderFallback"/> is <see langword="null"/>.
+ ///
+ ///
+ /// Thrown when <paramref name="maxHeaderCount"/> or <paramref name="maxHeaderLength"/> is
+ /// less than 1.
+ ///
+ protected CsvReaderVisitorWithUTF8HeadersBase(int maxHeaderCount, int maxHeaderLength, bool ignoreUTF8IdentifierOnFirstHeaderField, DecoderFallback decoderFallback)
+ {
+ if (maxHeaderCount < 1)
+ {
+ throw new ArgumentOutOfRangeException(nameof(maxHeaderCount), maxHeaderCount, "Must be greater than zero.");
+ }
+
+ if (maxHeaderLength < 1)
+ {
+ throw new ArgumentOutOfRangeException(nameof(maxHeaderLength), maxHeaderLength, "Must be greater than zero.");
+ }
+
+ if (decoderFallback is null)
+ {
+ throw new ArgumentNullException(nameof(decoderFallback));
+ }
+
+ _ignoreUTF8IdentifierOnFirstHeaderField = ignoreUTF8IdentifierOnFirstHeaderField;
+
+ _headersBuilder = ImmutableArray.CreateBuilder<string>(maxHeaderCount);
+
+ _headerBuffer = new char[maxHeaderLength];
+
+ _headerDecoder = EncodingToUse.GetDecoder();
+ _headerDecoder.Fallback = decoderFallback;
+ }
+
+ ///
+ ///
+ /// Gets the headers of the CSV stream.
+ ///
+ ///
+ /// Only valid after <see cref="VisitEndOfHeaderRecord"/> has been called.
+ ///
+ ///
+ ///
+ /// Thrown when trying to access this value before <see cref="VisitEndOfHeaderRecord"/> has
+ /// been called.
+ ///
+ ///
+ /// Once initialized, the value will remain the same for as long as this object instance
+ /// stays alive.
+ ///
+ protected ImmutableArray<string> Headers
+ {
+ get
+ {
+ if (_headers.IsDefault)
+ {
+ ThrowExceptionWhenHeadersAreStillBeingBuilt();
+ }
+
+ return _headers;
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void ThrowExceptionWhenHeadersAreStillBeingBuilt() =>
+ throw new InvalidOperationException("Headers are still being built.");
+
+ ///
+ /// Gets the zero-based index of the field that is currently being read. The value should
+ /// be the length of during and
+ /// , except after or
+ /// has been called.
+ ///
+ protected int CurrentFieldIndex => _currentFieldIndex;
+
+ ///
+ public sealed override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk)
+ {
+ if (_headers.IsDefault || _currentFieldIndex >= _headers.Length)
+ {
+ VisitPartialFieldContentsSlow(chunk);
+ }
+ else
+ {
+ VisitPartialDataFieldContents(chunk);
+ }
+ }
+
+ ///
+ public sealed override void VisitEndOfField(ReadOnlySpan<byte> chunk)
+ {
+ if (_headers.IsDefault || _currentFieldIndex >= _headers.Length)
+ {
+ VisitEndOfFieldSlow(chunk);
+ }
+ else
+ {
+ VisitEndOfDataField(chunk);
+ ++_currentFieldIndex;
+ }
+ }
+
+ ///
+ public sealed override void VisitEndOfRecord()
+ {
+ if (_headers.IsDefault || _currentFieldIndex != _headers.Length)
+ {
+ VisitEndOfRecordSlow();
+ }
+ else
+ {
+ VisitEndOfDataRecord();
+ _currentFieldIndex = 0;
+ }
+ }
+
+ ///
+ ///
+ /// Notifies that all headers have been read and <see cref="Headers"/> is safe to read.
+ ///
+ ///
+ /// The default behavior is to do nothing.
+ ///
+ ///
+ protected virtual void VisitEndOfHeaderRecord() { }
+
+ ///
+ /// Visits part of a non-header field's data.
+ ///
+ ///
+ /// The data from this part of the field.
+ ///
+ ///
+ /// See documentation for <see cref="CsvReaderVisitorBase.VisitPartialFieldContents"/> for
+ /// details about when and how this method will be called.
+ ///
+ protected abstract void VisitPartialDataFieldContents(ReadOnlySpan<byte> chunk);
+
+ ///
+ /// Visits the last part of a non-header field's data.
+ ///
+ ///
+ /// The data from the last part of the field.
+ ///
+ ///
+ /// See documentation for <see cref="CsvReaderVisitorBase.VisitEndOfField"/> for
+ /// details about when and how this method will be called.
+ ///
+ protected abstract void VisitEndOfDataField(ReadOnlySpan<byte> chunk);
+
+ ///
+ /// Notifies that all fields in the current non-header record have been visited.
+ ///
+ ///
+ /// See documentation for <see cref="CsvReaderVisitorBase.VisitEndOfRecord"/> for
+ /// details about when and how this method will be called.
+ ///
+ protected abstract void VisitEndOfDataRecord();
+
+ ///
+ ///
+ /// Notifies that the current non-header record is about to be terminated without reading
+ /// all the fields that were identified in the header record.
+ ///
+ ///
+ /// The default behavior is to throw <see cref="CursivelyMissingDataFieldsException"/>.
+ ///
+ ///
+ protected virtual void VisitMissingDataFields()
+ {
+ if (_headers.IsDefault)
+ {
+ // we will never do this, but a cheeky subclass might.
+ throw new InvalidOperationException("This method is only intended to be called by the base class.");
+ }
+
+ throw new CursivelyMissingDataFieldsException(_headers.Length, _currentFieldIndex);
+ }
+
+ ///
+ ///
+ /// Notifies that data for a field is about to be read on a non-header record, but all the
+ /// fields that were identified in the header record have already been read.
+ ///
+ ///
+ /// This method is called before every single <see cref="VisitPartialFieldContents"/> or
+ /// <see cref="VisitEndOfField"/> call for fields not present in the header record.
+ ///
+ ///
+ /// The default behavior is to throw <see cref="CursivelyExtraDataFieldsException"/>.
+ ///
+ ///
+ protected virtual void VisitUnexpectedDataField()
+ {
+ if (_headers.IsDefault)
+ {
+ // we will never do this, but a cheeky subclass might.
+ throw new InvalidOperationException("This method is only intended to be called by the base class.");
+ }
+
+ throw new CursivelyExtraDataFieldsException(_headers.Length);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void VisitPartialFieldContentsSlow(ReadOnlySpan<byte> chunk)
+ {
+ if (_headers.IsDefault)
+ {
+ if (_headersBuilder.Capacity == _headersBuilder.Count)
+ {
+ throw new CursivelyTooManyHeadersException(_headersBuilder.Capacity);
+ }
+
+ if (chunk.IsEmpty)
+ {
+ // the tokenizer will never do this, but an external caller might.
+ return;
+ }
+
+ fixed (byte* b = &chunk[0])
+ {
+ VisitHeaderChunk(b, chunk.Length, false);
+ }
+ }
+ else
+ {
+ Debug.Assert(_currentFieldIndex >= _headers.Length, "Another condition brought us into VisitPartialFieldContentsSlow without updating this bit.");
+ VisitUnexpectedDataField();
+ VisitPartialDataFieldContents(chunk);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private unsafe void VisitEndOfFieldSlow(ReadOnlySpan<byte> chunk)
+ {
+ if (_headers.IsDefault)
+ {
+ if (_headersBuilder.Capacity == _headersBuilder.Count)
+ {
+ throw new CursivelyTooManyHeadersException(_headersBuilder.Capacity);
+ }
+
+ if (chunk.IsEmpty)
+ {
+ // the tokenizer will never do this, but an external caller might. note that
+ // the Decoder methods require a non-null pointer, even if the length is zero.
+ byte b = 0xFF;
+ VisitHeaderChunk(&b, 0, true);
+ }
+ else
+ {
+ fixed (byte* b = &chunk[0])
+ {
+ VisitHeaderChunk(b, chunk.Length, true);
+ }
+ }
+
+ int headerBufferOffset = 0;
+
+ if (_headersBuilder.Count == 0 &&
+ _ignoreUTF8IdentifierOnFirstHeaderField &&
+ _headerBufferConsumed > 0 &&
+ _headerBuffer[0] == '\uFEFF')
+ {
+ headerBufferOffset = 1;
+ }
+
+ _headersBuilder.Add(new string(_headerBuffer, headerBufferOffset, _headerBufferConsumed - headerBufferOffset));
+ _headerBufferConsumed = 0;
+ ++_currentFieldIndex;
+ }
+ else
+ {
+ Debug.Assert(_currentFieldIndex >= _headers.Length, "Another condition brought us into VisitEndOfFieldSlow without updating this bit.");
+ VisitUnexpectedDataField();
+ VisitEndOfDataField(chunk);
+ _currentFieldIndex = checked(_currentFieldIndex + 1);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private void VisitEndOfRecordSlow()
+ {
+ if (_headers.IsDefault)
+ {
+ if (_headersBuilder.Count == 0)
+ {
+ // the tokenizer will never do this, but an external caller might.
+ throw new InvalidOperationException("No fields were present in the header record.");
+ }
+
+ // this is almost equivalent to setting _headers = _headersBuilder.ToImmutable(),
+ // but this does a better job rewarding people for setting the max field count to
+ // the actual field count, which will often be the case.
+ _headersBuilder.Capacity = _headersBuilder.Count;
+ _headers = _headersBuilder.MoveToImmutable();
+ _currentFieldIndex = _headers.Length;
+
+ // we're done building headers, so free up our buffer.
+ _headerBuffer = null;
+
+ // let the subclass know that the headers are ready, in case it wants to set up some
+ // stuff before the field data starts rolling in.
+ VisitEndOfHeaderRecord();
+ }
+ else
+ {
+ Debug.Assert(_currentFieldIndex != _headers.Length, "Another condition brought us into VisitEndOfRecordSlow without updating this bit.");
+ if (_currentFieldIndex < _headers.Length)
+ {
+ VisitMissingDataFields();
+ }
+
+ VisitEndOfDataRecord();
+ }
+
+ _currentFieldIndex = 0;
+ }
+
+ private unsafe void VisitHeaderChunk(byte* b, int byteCount, bool flush)
+ {
+ int charCount = _headerDecoder.GetCharCount(b, byteCount, flush);
+ if (_headerBufferConsumed + charCount <= _headerBuffer.Length)
+ {
+ fixed (char* c = &_headerBuffer[_headerBufferConsumed])
+ {
+ _headerDecoder.GetChars(b, byteCount, c, charCount, flush);
+ }
+ }
+ else
+ {
+ throw new CursivelyHeaderIsTooLongException(_headerBuffer.Length);
+ }
+
+ _headerBufferConsumed += charCount;
+ }
+ }
+}
diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs
index 8dd039e..0595009 100644
--- a/src/Cursively/CsvTokenizer.cs
+++ b/src/Cursively/CsvTokenizer.cs
@@ -142,20 +142,50 @@ namespace Cursively
///
public class CsvTokenizer
{
- private const byte COMMA = (byte)',';
-
private const byte CR = (byte)'\r';
private const byte LF = (byte)'\n';
private const byte QUOTE = (byte)'"';
- private static readonly byte[] AllStopBytes = { COMMA, QUOTE, CR, LF };
-
- private static readonly byte[] AllStopBytesExceptQuote = { COMMA, CR, LF };
+ private readonly byte _delimiter;
private ParserFlags _parserFlags;
+ ///
+ /// Initializes a new instance of the <see cref="CsvTokenizer"/> class.
+ ///
+ public CsvTokenizer()
+ : this((byte)',')
+ {
+ }
+
+ ///
+ /// Initializes a new instance of the <see cref="CsvTokenizer"/> class.
+ ///
+ ///
+ /// The single byte to expect to see between fields of the same record. This may not be an
+ /// end-of-line or double-quote character, as those have special meanings.
+ ///
+ ///
+ /// Thrown when <paramref name="delimiter"/> is 0x0A, 0x0D, or
+ /// 0x22.
+ ///
+ public CsvTokenizer(byte delimiter)
+ {
+ switch (delimiter)
+ {
+ case CR:
+ case LF:
+ case QUOTE:
+ throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter));
+
+ default:
+ _delimiter = delimiter;
+ break;
+ }
+ }
+
[Flags]
private enum ParserFlags : byte
{
@@ -189,8 +219,7 @@ public void ProcessNextChunk(ReadOnlySpan<byte> chunk, CsvReaderVisitorBase visitor)
visitor = CsvReaderVisitorBase.Null;
}
- // cache the implicit conversion for the sake of "portable span" targets.
- ReadOnlySpan<byte> allStopBytes = AllStopBytes;
+ byte delimiter = _delimiter;
// we're going to consume the entire buffer that was handed to us.
while (!chunk.IsEmpty)
@@ -204,17 +233,24 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi
continue;
}
- int idx = chunk.IndexOfAny(allStopBytes);
- if (idx < 0)
- {
- visitor.VisitPartialFieldContents(chunk);
- _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
- break;
- }
-
- switch (chunk[idx])
+ // loop one-by-one, instead of doing an IndexOfAny, greedily assuming that the most
+ // performance-sensitive applications will tend to have few enough bytes in each
+ // unquoted field that this manual inlining will benefit those applications **much**
+ // more than practically any IndexOfAny implementation would.
+ for (int idx = 0; idx < chunk.Length; idx++)
{
- case QUOTE:
+ byte c = chunk[idx];
+ if (c == delimiter)
+ {
+ _parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
+ visitor.VisitEndOfField(chunk.Slice(0, idx));
+ }
+ else if (c == CR || c == LF)
+ {
+ ProcessEndOfRecord(chunk.Slice(0, idx), visitor);
+ }
+ else if (c == QUOTE)
+ {
if (idx == 0)
{
_parserFlags = ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
@@ -224,23 +260,27 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi
// RFC 4180 forbids quotes that show up anywhere but the beginning of a
// field, so it's up to us to decide what we want to do about this. We
// choose to treat all such quotes as just regular data.
- visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1));
_parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
- }
-
- break;
-
- case COMMA:
- visitor.VisitEndOfField(chunk.Slice(0, idx));
- _parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
- break;
+ visitor.VisitPartialFieldContents(chunk.Slice(0, idx + 1));
- default:
- ProcessEndOfLine(chunk.Slice(0, idx), visitor);
- break;
+ // let the visitor know that this was nonstandard.
+ visitor.VisitNonstandardQuotedField();
+ }
+ }
+ else
+ {
+ continue;
+ }
+
+ chunk = chunk.Slice(idx + 1);
+ goto nextLoop;
}
- chunk = chunk.Slice(idx + 1);
+ _parserFlags = ParserFlags.ReadAnythingInCurrentField | ParserFlags.ReadAnythingOnCurrentLine;
+ visitor.VisitPartialFieldContents(chunk);
+ break;
+
+ nextLoop:;
}
}
@@ -266,18 +306,12 @@ public void ProcessEndOfStream(CsvReaderVisitorBase visitor)
visitor = CsvReaderVisitorBase.Null;
}
- ProcessEndOfLine(default, visitor);
+ ProcessEndOfRecord(default, visitor);
}
private void PickUpFromLastTime(ref ReadOnlySpan<byte> readBuffer, CsvReaderVisitorBase visitor)
{
- if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0)
- {
- HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor);
- return;
- }
-
- if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded)) == ParserFlags.CurrentFieldStartedWithQuote)
+ if ((_parserFlags & (ParserFlags.CurrentFieldStartedWithQuote | ParserFlags.QuotedFieldDataEnded | ParserFlags.CutAtPotentiallyTerminalDoubleQuote)) == ParserFlags.CurrentFieldStartedWithQuote)
{
int idx = readBuffer.IndexOf(QUOTE);
if (idx < 0)
@@ -296,76 +330,87 @@ private void PickUpFromLastTime(ref ReadOnlySpan<byte> readBuffer, CsvReaderVisitorBase visitor)
// in fact, it should pay off so well in so many cases that we can probably even
// get away with making the other case really suboptimal, which is what it will
// do when we pick up where we leave off after setting this flag.
- visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx));
_parserFlags |= ParserFlags.CutAtPotentiallyTerminalDoubleQuote;
+ visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx));
readBuffer = default;
return;
}
// we have at least one more byte, so let's see what the double quote actually means
- switch (readBuffer[idx + 1])
+ byte b = readBuffer[idx + 1];
+ if (b == QUOTE)
{
- case QUOTE:
- // the double quote we stopped at was escaping a literal double quote, so we
- // send everything up to and including the escaping quote.
- visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1));
- break;
-
- case COMMA:
- // the double quote was the end of a quoted field, so send the entire data
- // from the beginning of this quoted field data chunk up to the double quote
- // that terminated it (excluding, of course, the double quote itself).
- visitor.VisitEndOfField(readBuffer.Slice(0, idx));
- _parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
- break;
-
- case CR:
- case LF:
- // same thing as the COMMA case, just the field ended at the end of a line
- // instead of the end of a field on the current line.
- ProcessEndOfLine(readBuffer.Slice(0, idx), visitor);
- break;
-
- default:
- // the double quote was the end of the quoted part of the field data, but
- // then it continues on with more data; don't spend too much time optimizing
- // this case since it's not RFC 4180, just do the parts we need to do in
- // order to behave the way we said we would.
- _parserFlags |= ParserFlags.QuotedFieldDataEnded;
- visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx));
- visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1));
- break;
+ // the double quote we stopped at was escaping a literal double quote, so we
+ // send everything up to and including the escaping quote.
+ visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx + 1));
+ }
+ else if (b == _delimiter)
+ {
+ // the double quote was the end of a quoted field, so send the entire data from
+ // the beginning of this quoted field data chunk up to the double quote that
+ // terminated it (excluding, of course, the double quote itself).
+ _parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
+ visitor.VisitEndOfField(readBuffer.Slice(0, idx));
+ }
+ else if (b == CR || b == LF)
+ {
+ // same thing as the delimiter case, just the field ended at the end of a line
+ // instead of the end of a field on the current line.
+ ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor);
+ }
+ else
+ {
+ // the double quote was the end of the quoted part of the field data, but then
+ // it continues on with more data; don't spend too much time optimizing this
+ // case since it's not RFC 4180, just do the parts we need to do in order to
+ // behave the way we said we would.
+ _parserFlags |= ParserFlags.QuotedFieldDataEnded;
+ visitor.VisitPartialFieldContents(readBuffer.Slice(0, idx));
+ visitor.VisitPartialFieldContents(readBuffer.Slice(idx + 1, 1));
+
+ // let the visitor know that this was nonstandard.
+ visitor.VisitNonstandardQuotedField();
}
// slice off the data up to the quote and the next byte that we read.
readBuffer = readBuffer.Slice(idx + 2);
- return;
}
-
- // this is expected to be rare: either we were cut between field reads, or we're reading
- // nonstandard field data where there's a quote that neither starts nor ends the field.
+ else
{
- int idx = readBuffer.IndexOfAny(AllStopBytesExceptQuote);
- if (idx < 0)
+ // this is expected to be rare: either we were cut between field reads, or we're
+ // reading nonstandard field data where there's a quote that neither starts nor ends
+ // the field; by this point, we don't save enough state to remember which case we're
+ // in, so VisitNonstandardQuotedField **MUST** have been correctly called (or not)
+ // before entering this section.
+ if ((_parserFlags & ParserFlags.CutAtPotentiallyTerminalDoubleQuote) != 0)
{
- visitor.VisitPartialFieldContents(readBuffer);
- readBuffer = default;
+ HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref readBuffer, visitor);
return;
}
- switch (readBuffer[idx])
+ for (int idx = 0; idx < readBuffer.Length; idx++)
{
- case COMMA:
- visitor.VisitEndOfField(readBuffer.Slice(0, idx));
+ byte b = readBuffer[idx];
+ if (b == _delimiter)
+ {
_parserFlags = ParserFlags.ReadAnythingOnCurrentLine;
- break;
-
- default:
- ProcessEndOfLine(readBuffer.Slice(0, idx), visitor);
- break;
+ visitor.VisitEndOfField(readBuffer.Slice(0, idx));
+ }
+ else if (b == CR || b == LF)
+ {
+ ProcessEndOfRecord(readBuffer.Slice(0, idx), visitor);
+ }
+ else
+ {
+ continue;
+ }
+
+ readBuffer = readBuffer.Slice(idx + 1);
+ return;
}
- readBuffer = readBuffer.Slice(idx + 1);
+ visitor.VisitPartialFieldContents(readBuffer);
+ readBuffer = default;
}
}
@@ -379,39 +424,48 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan<byte> readBuffer, CsvReaderVisitorBase visitor)
- private void ProcessEndOfLine(ReadOnlySpan<byte> lastFieldDataChunk, CsvReaderVisitorBase visitor)
+ private void ProcessEndOfRecord(ReadOnlySpan<byte> lastFieldDataChunk, CsvReaderVisitorBase visitor)
{
- if (!lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0)
+ // even if the last field data chunk is empty, we still need to send it: we might be
+ // looking at a newline that immediately follows a comma, which is defined to mean
+ // an empty field at the end of a line.
+ bool notify = !lastFieldDataChunk.IsEmpty || (_parserFlags & ParserFlags.ReadAnythingOnCurrentLine) != 0;
+
+ _parserFlags = ParserFlags.None;
+ if (notify)
{
- // even if the last field data chunk is empty, we still need to send it: we might be
- // looking at a newline that immediately follows a comma, which is defined to mean
- // an empty field at the end of a line.
visitor.VisitEndOfField(lastFieldDataChunk);
visitor.VisitEndOfRecord();
}
-
- _parserFlags = ParserFlags.None;
}
}
}
diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj
index a702484..ebdabd9 100644
--- a/src/Cursively/Cursively.csproj
+++ b/src/Cursively/Cursively.csproj
@@ -2,6 +2,7 @@
<TargetFramework>netstandard2.0</TargetFramework>
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
@@ -15,7 +16,9 @@
+
+
diff --git a/src/Cursively/CursivelyDataStreamException.cs b/src/Cursively/CursivelyDataStreamException.cs
new file mode 100644
index 0000000..9926d37
--- /dev/null
+++ b/src/Cursively/CursivelyDataStreamException.cs
@@ -0,0 +1,31 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Serialization;
+
+namespace Cursively
+{
+ ///
+ /// Serves as the base class for exceptions thrown by this library to indicate problems with the
+ /// actual contents of a CSV stream.
+ ///
+ [Serializable]
+ [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")]
+ public abstract class CursivelyDataStreamException : Exception
+ {
+ private protected CursivelyDataStreamException(string message)
+ : base(message)
+ {
+ }
+
+ private protected CursivelyDataStreamException(string message, Exception innerException)
+ : base(message, innerException)
+ {
+ }
+
+ [SuppressMessage("Microsoft.Usage", "CA2229:ImplementSerializationConstructors")]
+ private protected CursivelyDataStreamException(SerializationInfo info, StreamingContext context)
+ : base(info, context)
+ {
+ }
+ }
+}
diff --git a/src/Cursively/CursivelyDecoderExceptionFallback.cs b/src/Cursively/CursivelyDecoderExceptionFallback.cs
new file mode 100644
index 0000000..22192ea
--- /dev/null
+++ b/src/Cursively/CursivelyDecoderExceptionFallback.cs
@@ -0,0 +1,42 @@
+using System.Text;
+
+namespace Cursively
+{
+ internal sealed class CursivelyDecoderExceptionFallback : DecoderFallback
+ {
+ public override int MaxCharCount => 0;
+
+ public override DecoderFallbackBuffer CreateFallbackBuffer() => new CursivelyDecoderExceptionFallbackBuffer();
+
+ public override bool Equals(object obj) => obj is CursivelyDecoderExceptionFallback;
+
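+ // arbitrary constant: all instances compare equal, so they must also hash equal.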
+ public override int GetHashCode() => 1234;
+
+ private sealed class CursivelyDecoderExceptionFallbackBuffer : DecoderFallbackBuffer
+ {
+ public override int Remaining => 0;
+
+ public override char GetNextChar() => '\0';
+
+ public override bool MovePrevious() => false;
+
+ public override bool Fallback(byte[] bytesUnknown, int index)
+ {
+ // use the built-in logic to get a helpful exception message.
+ var inner = new DecoderExceptionFallbackBuffer();
+ try
+ {
+ return inner.Fallback(bytesUnknown, index);
+ }
+ catch (DecoderFallbackException ex)
+ {
+ // wrap it: C# / .NET do not support multiple inheritance, and it's more
+ // important for consumers to be able to catch CursivelyDataStreamException
+ // for every exception that means "this breaks one of Cursively's rules, but
+ // the system is otherwise operating normally".
+ throw new CursivelyHeadersAreNotUTF8Exception(ex);
+ }
+ }
+ }
+ }
+}
diff --git a/src/Cursively/CursivelyExtraDataFieldsException.cs b/src/Cursively/CursivelyExtraDataFieldsException.cs
new file mode 100644
index 0000000..71996b1
--- /dev/null
+++ b/src/Cursively/CursivelyExtraDataFieldsException.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Serialization;
+
+namespace Cursively
+{
+ /// <summary>
+ /// Raised by <see cref="CsvReaderVisitorWithUTF8HeadersBase"/>, by default, when a data record
+ /// contains more fields than the header record.
+ /// </summary>
+ [Serializable]
+ [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")]
+ public sealed class CursivelyExtraDataFieldsException : CursivelyDataStreamException
+ {
+ internal CursivelyExtraDataFieldsException(int headerFieldCount)
+ : base($"CSV stream contains a non-header record with more fields than the {headerFieldCount} field(s) present in the header record.")
+ {
+ }
+
+ private CursivelyExtraDataFieldsException(SerializationInfo info, StreamingContext context)
+ : base(info, context)
+ {
+ }
+ }
+}
diff --git a/src/Cursively/CursivelyHeaderIsTooLongException.cs b/src/Cursively/CursivelyHeaderIsTooLongException.cs
new file mode 100644
index 0000000..882098e
--- /dev/null
+++ b/src/Cursively/CursivelyHeaderIsTooLongException.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Serialization;
+
+namespace Cursively
+{
+ /// <summary>
+ /// Raised by <see cref="CsvReaderVisitorWithUTF8HeadersBase"/> when the length of a header
+ /// exceeds the configured maximum.
+ /// </summary>
+ [Serializable]
+ [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")]
+ public sealed class CursivelyHeaderIsTooLongException : CursivelyDataStreamException
+ {
+ internal CursivelyHeaderIsTooLongException(int maxLength)
+ : base($"CSV stream contains a header that is longer than the configured max length of {maxLength}.")
+ {
+ }
+
+ private CursivelyHeaderIsTooLongException(SerializationInfo info, StreamingContext context)
+ : base(info, context)
+ {
+ }
+ }
+}
diff --git a/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs b/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs
new file mode 100644
index 0000000..283068b
--- /dev/null
+++ b/src/Cursively/CursivelyHeadersAreNotUTF8Exception.cs
@@ -0,0 +1,32 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Serialization;
+using System.Text;
+
+namespace Cursively
+{
+ /// <summary>
+ /// Raised by <see cref="CsvReaderVisitorWithUTF8HeadersBase"/>, by default, when the header
+ /// record contains invalid UTF-8 bytes.
+ /// </summary>
+ [Serializable]
+ [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")]
+ public sealed class CursivelyHeadersAreNotUTF8Exception : CursivelyDataStreamException
+ {
+ internal CursivelyHeadersAreNotUTF8Exception(DecoderFallbackException innerException)
+ : base(innerException.Message, innerException)
+ {
+ }
+
+ private CursivelyHeadersAreNotUTF8Exception(SerializationInfo info, StreamingContext context)
+ : base(info, context)
+ {
+ }
+
+ /// <summary>
+ /// Gets the <see cref="DecoderFallbackException"/> instance that holds the actual decoder
+ /// state when the current exception was raised.
+ /// </summary>
+ public DecoderFallbackException InnerDecoderFallbackException => (DecoderFallbackException)InnerException;
+ }
+}
diff --git a/src/Cursively/CursivelyMissingDataFieldsException.cs b/src/Cursively/CursivelyMissingDataFieldsException.cs
new file mode 100644
index 0000000..03c776d
--- /dev/null
+++ b/src/Cursively/CursivelyMissingDataFieldsException.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Serialization;
+
+namespace Cursively
+{
+ /// <summary>
+ /// Raised by <see cref="CsvReaderVisitorWithUTF8HeadersBase"/>, by default, when a data record
+ /// contains fewer fields than the header record.
+ /// </summary>
+ [Serializable]
+ [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")]
+ public sealed class CursivelyMissingDataFieldsException : CursivelyDataStreamException
+ {
+ internal CursivelyMissingDataFieldsException(int headerFieldCount, int dataFieldCount)
+ : base($"CSV stream contains a non-header record with only {dataFieldCount} field(s), fewer than the {headerFieldCount} field(s) present in the header record.")
+ {
+ }
+
+ private CursivelyMissingDataFieldsException(SerializationInfo info, StreamingContext context)
+ : base(info, context)
+ {
+ }
+ }
+}
diff --git a/src/Cursively/CursivelyTooManyHeadersException.cs b/src/Cursively/CursivelyTooManyHeadersException.cs
new file mode 100644
index 0000000..ad5f876
--- /dev/null
+++ b/src/Cursively/CursivelyTooManyHeadersException.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.Serialization;
+
+namespace Cursively
+{
+ /// <summary>
+ /// Raised by <see cref="CsvReaderVisitorWithUTF8HeadersBase"/> when the number of headers
+ /// exceeds the configured maximum.
+ /// </summary>
+ [Serializable]
+ [SuppressMessage("Microsoft.Design", "CA1032:ImplementStandardExceptionConstructors")]
+ public sealed class CursivelyTooManyHeadersException : CursivelyDataStreamException
+ {
+ internal CursivelyTooManyHeadersException(int maxHeaderCount)
+ : base($"CSV stream contains more headers than the configured maximum of {maxHeaderCount}.")
+ {
+ }
+
+ private CursivelyTooManyHeadersException(SerializationInfo info, StreamingContext context)
+ : base(info, context)
+ {
+ }
+ }
+}
diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index a83c324..4b420f8 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -21,8 +21,11 @@
false
-
+
+
+
+
diff --git a/test/Cursively.Benchmark/.gitignore b/test/Cursively.Benchmark/.gitignore
new file mode 100644
index 0000000..ff63206
--- /dev/null
+++ b/test/Cursively.Benchmark/.gitignore
@@ -0,0 +1 @@
+large-csv-files/*.csv
diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs
index 38d3d9c..c946b5a 100644
--- a/test/Cursively.Benchmark/Program.cs
+++ b/test/Cursively.Benchmark/Program.cs
@@ -1,5 +1,6 @@
using System;
using System.IO;
+using System.IO.Compression;
using System.Runtime.CompilerServices;
using System.Text;
@@ -13,7 +14,6 @@ namespace Cursively.Benchmark
{
[ClrJob]
[CoreJob]
- [CoreRtJob]
[GcServer(true)]
[MemoryDiagnoser]
public class Program
@@ -22,15 +22,6 @@ public class Program
[Benchmark(Baseline = true)]
[ArgumentsSource(nameof(CsvFiles))]
- public void NopUsingCursively(CsvFile csvFile)
- {
- var tokenizer = new CsvTokenizer();
- tokenizer.ProcessNextChunk(csvFile.FileData, null);
- tokenizer.ProcessEndOfStream(null);
- }
-
- [Benchmark]
- [ArgumentsSource(nameof(CsvFiles))]
public long CountRowsUsingCursively(CsvFile csvFile)
{
var visitor = new RowCountingVisitor();
@@ -63,7 +54,8 @@ private static int Main()
var prog = new Program();
foreach (var csvFile in CsvFiles)
{
- if (prog.CountRowsUsingCursively(csvFile) != prog.CountRowsUsingCsvHelper(csvFile))
+ long rowCount = prog.CountRowsUsingCursively(csvFile);
+ if (prog.CountRowsUsingCsvHelper(csvFile) != rowCount)
{
Console.Error.WriteLine($"Failed on {csvFile}.");
return 1;
@@ -88,14 +80,29 @@ public CsvFile(string fullPath) =>
public override string ToString() => FileName;
}
- private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) =>
- Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"),
- fullPath => new CsvFile(fullPath));
+ private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null)
+ {
+ string csvFileDirectoryPath = Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files");
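+ // the benchmark corpus now ships as a single Git LFS zip; extract it on first
+ // use. extraction goes to a "-tmp" directory that is only renamed into place
+ // once it is complete, so an interrupted run can't leave behind a directory
+ // that looks fully populated.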
+ if (!Directory.Exists(csvFileDirectoryPath))
+ {
+ string tmpDirectoryPath = csvFileDirectoryPath + "-tmp";
+ if (Directory.Exists(tmpDirectoryPath))
+ {
+ Directory.Delete(tmpDirectoryPath, true);
+ }
+
+ string zipFilePath = csvFileDirectoryPath + ".zip";
+ Directory.CreateDirectory(tmpDirectoryPath);
+ ZipFile.ExtractToDirectory(zipFilePath, tmpDirectoryPath);
+ Directory.Move(tmpDirectoryPath, csvFileDirectoryPath);
+ }
+
+ return Array.ConvertAll(Directory.GetFiles(csvFileDirectoryPath, "*.csv"),
+ fullPath => new CsvFile(fullPath));
+ }
private sealed class RowCountingVisitor : CsvReaderVisitorBase
{
- public long CharCount { get; private set; }
-
public long RowCount { get; private set; }
public override void VisitEndOfRecord() => ++RowCount;
diff --git a/test/Cursively.Benchmark/large-csv-files.zip b/test/Cursively.Benchmark/large-csv-files.zip
new file mode 100644
index 0000000..2296075
--- /dev/null
+++ b/test/Cursively.Benchmark/large-csv-files.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc70c8d20921546b1fa4e587859a22a2edfce76325d8e5bc780b98a78409a76d
+size 47747942
diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv
deleted file mode 100644
index 718947c..0000000
--- a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:611a7ba4f69bf3ab34f1fbf3fbf4711bfa8fb91a210683bdf4c1915818f1cfe0
-size 4900444
diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv
deleted file mode 100644
index fde3ed5..0000000
--- a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3e82c977d84c24a6b16063b634cbeab1e8409b34724b0ecf07893f45f8aadb53
-size 2900444
diff --git a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv
deleted file mode 100644
index 61dd063..0000000
--- a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f1e211bf4eb14ab578ccf6aff141e8db41e80314b39b85fba5f047830f746e4
-size 10020000
diff --git a/test/Cursively.Benchmark/large-csv-files/mocked.csv b/test/Cursively.Benchmark/large-csv-files/mocked.csv
deleted file mode 100644
index 4b45c74..0000000
--- a/test/Cursively.Benchmark/large-csv-files/mocked.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e01c74f0a9622e4ad72233ff35bfcc2663eca10b558d0d7e7f71932c6c981d4b
-size 12731500
diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs
index 0c18b40..77f951e 100644
--- a/test/Cursively.Tests/CsvTokenizerTests.cs
+++ b/test/Cursively.Tests/CsvTokenizerTests.cs
@@ -17,56 +17,164 @@ public sealed class CsvTokenizerTests
private static readonly int[] TestChunkLengths = { 1, 2, 3, 5, 8, 13, 21, 34 };
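+ // candidate delimiters for the tests below: the default comma, plus tab to
+ // cover the tokenizer's configurable delimiter.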
+ private static readonly byte[] TestDelimiters = { (byte)',', (byte)'\t' };
+
public static IEnumerable<object[]>
-
+
diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv
new file mode 100644
index 0000000..7ff9b39
--- /dev/null
+++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/invalid-utf8-in-header.csv
@@ -0,0 +1,2 @@
+é,
+a,
\ No newline at end of file
diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv
new file mode 100644
index 0000000..7cc53b4
--- /dev/null
+++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/missing-data-fields.csv
@@ -0,0 +1,3 @@
+a,b,c
+1,2,3
+1,2
\ No newline at end of file
diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv
new file mode 100644
index 0000000..636ad5f
--- /dev/null
+++ b/test/Cursively.Tests/TestCsvFiles/with-headers/invalid/too-many-data-fields.csv
@@ -0,0 +1,2 @@
+"a","b","c"
+"a","b","c",
\ No newline at end of file
diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv
new file mode 100644
index 0000000..ee4f68d
--- /dev/null
+++ b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/invalid-utf8-outside-header.csv
@@ -0,0 +1,2 @@
+a,
+é,
\ No newline at end of file
diff --git a/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv
new file mode 100644
index 0000000..b4819c6
--- /dev/null
+++ b/test/Cursively.Tests/TestCsvFiles/with-headers/valid/simple.csv
@@ -0,0 +1,3 @@
+A,B,C
+1,2,3
+do,re,mi
\ No newline at end of file