From 348a1908e6d26fb5bdb8beaecdfa1bac0e64c6a9 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Tue, 21 May 2019 22:57:30 -0400 Subject: [PATCH 01/22] Use GitVersion. --- Cursively.sln | 28 +++++++++---------- Directory.Build.props | 5 ++++ src/Directory.Build.props | 5 ++++ .../Cursively.Benchmark.csproj | 2 -- {src => test}/Cursively.Benchmark/Program.cs | 0 .../large-csv-files/mocked.csv | 0 test/Directory.Build.props | 4 +++ 7 files changed, 28 insertions(+), 16 deletions(-) rename {src => test}/Cursively.Benchmark/Cursively.Benchmark.csproj (93%) rename {src => test}/Cursively.Benchmark/Program.cs (100%) rename {src => test}/Cursively.Benchmark/large-csv-files/mocked.csv (100%) diff --git a/Cursively.sln b/Cursively.sln index 4c11141..db29ccb 100644 --- a/Cursively.sln +++ b/Cursively.sln @@ -5,10 +5,10 @@ VisualStudioVersion = 16.0.28803.452 MinimumVisualStudioVersion = 15.0.26124.0 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively", "src\Cursively\Cursively.csproj", "{C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively.Benchmark", "src\Cursively.Benchmark\Cursively.Benchmark.csproj", "{B97EC4E4-F878-4595-BD80-7B30CBAAF986}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively.Tests", "test\Cursively.Tests\Cursively.Tests.csproj", "{029F8A31-11BF-490B-9413-AD77D1B5FDAB}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively.Benchmark", "test\Cursively.Benchmark\Cursively.Benchmark.csproj", "{D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -31,18 +31,6 @@ Global {C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}.Release|x64.Build.0 = Release|Any CPU {C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}.Release|x86.ActiveCfg = Release|Any CPU {C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}.Release|x86.Build.0 = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|Any 
CPU.ActiveCfg = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|Any CPU.Build.0 = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x64.ActiveCfg = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x64.Build.0 = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x86.ActiveCfg = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x86.Build.0 = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|Any CPU.ActiveCfg = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|Any CPU.Build.0 = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x64.ActiveCfg = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x64.Build.0 = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x86.ActiveCfg = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x86.Build.0 = Release|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Debug|Any CPU.Build.0 = Debug|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Debug|x64.ActiveCfg = Debug|Any CPU @@ -55,6 +43,18 @@ Global {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Release|x64.Build.0 = Release|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Release|x86.ActiveCfg = Release|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Release|x86.Build.0 = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x64.ActiveCfg = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x64.Build.0 = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x86.ActiveCfg = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x86.Build.0 = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|Any 
CPU.Build.0 = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x64.ActiveCfg = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x64.Build.0 = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x86.ActiveCfg = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/Directory.Build.props b/Directory.Build.props index 3d31492..c9cc706 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -8,6 +8,11 @@ true 7.3 + true + + + + true diff --git a/src/Directory.Build.props b/src/Directory.Build.props index eec04f4..30ea211 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -7,4 +7,9 @@ true + + + + + diff --git a/src/Cursively.Benchmark/Cursively.Benchmark.csproj b/test/Cursively.Benchmark/Cursively.Benchmark.csproj similarity index 93% rename from src/Cursively.Benchmark/Cursively.Benchmark.csproj rename to test/Cursively.Benchmark/Cursively.Benchmark.csproj index 122f96c..4620fe7 100644 --- a/src/Cursively.Benchmark/Cursively.Benchmark.csproj +++ b/test/Cursively.Benchmark/Cursively.Benchmark.csproj @@ -4,8 +4,6 @@ Exe netcoreapp2.2;net472 true - - false diff --git a/src/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs similarity index 100% rename from src/Cursively.Benchmark/Program.cs rename to test/Cursively.Benchmark/Program.cs diff --git a/src/Cursively.Benchmark/large-csv-files/mocked.csv b/test/Cursively.Benchmark/large-csv-files/mocked.csv similarity index 100% rename from src/Cursively.Benchmark/large-csv-files/mocked.csv rename to test/Cursively.Benchmark/large-csv-files/mocked.csv diff --git a/test/Directory.Build.props b/test/Directory.Build.props index cbf08bc..9399748 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -3,4 +3,8 @@ + + False + + From 2d4483eb29a22d0d912a8dfaacee0932d0220767 Mon 
Sep 17 00:00:00 2001 From: Joe Amenta Date: Tue, 21 May 2019 23:00:39 -0400 Subject: [PATCH 02/22] SourceLink all the things! --- Directory.Build.props | 4 ++++ src/Directory.Build.props | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Directory.Build.props b/Directory.Build.props index c9cc706..9da6d13 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -15,4 +15,8 @@ true + + + + diff --git a/src/Directory.Build.props b/src/Directory.Build.props index 30ea211..be95a54 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -8,7 +8,6 @@ - From d6e64f93617a4647b6f2d18fd220adfc5db77ee1 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Tue, 21 May 2019 23:10:34 -0400 Subject: [PATCH 03/22] packaging stuff --- src/Cursively/Cursively.csproj | 9 +++++++++ src/Directory.Build.props | 10 +++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index 2e0fd0c..f271121 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -4,6 +4,15 @@ netstandard2.0 + + Cursively + Cursively - Fast CSV Processing + airbreather + airbreather + MIT + csv;comma;separated;value + + diff --git a/src/Directory.Build.props b/src/Directory.Build.props index be95a54..7c54da0 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -4,7 +4,15 @@ - true + true + + airbreather + Copyright © 2019 - $([System.DateTime]::UtcNow.Year) airbreather + + true + true + true + snupkg From aae7672a299c7acbe78d4790751559b9bf9f5ebd Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Tue, 21 May 2019 23:13:36 -0400 Subject: [PATCH 04/22] package meta --- src/Cursively/Cursively.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cursively/Cursively.csproj b/src/Cursively/Cursively.csproj index f271121..a702484 100644 --- a/src/Cursively/Cursively.csproj +++ b/src/Cursively/Cursively.csproj @@ -7,6 +7,7 @@ Cursively Cursively - Fast CSV 
Processing + A .NET library for RFC4180-compliant CSV processing that's still fast and fault-tolerant. airbreather airbreather MIT From 2b5b56596e076137498666121aed8046cbbcc3d6 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Wed, 22 May 2019 08:33:55 -0400 Subject: [PATCH 05/22] Code Cleanup --- .editorconfig | 196 ++++++++++++++++++++++ src/Cursively/CsvTokenizer.cs | 8 +- test/Cursively.Benchmark/Program.cs | 12 +- test/Cursively.Tests/CsvTokenizerTests.cs | 4 +- 4 files changed, 207 insertions(+), 13 deletions(-) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..8719751 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,196 @@ +# Remove the line below if you want to inherit .editorconfig settings from higher directories +root = true + +# C# files +[*.cs] + +#### Core EditorConfig Options #### + +# Indentation and spacing +indent_size = 4 +indent_style = space +tab_width = 4 + +# New line preferences +end_of_line = crlf +insert_final_newline = true + +#### .NET Coding Conventions #### + +# Organize usings +dotnet_separate_import_directive_groups = true +dotnet_sort_system_directives_first = true + +# this. and Me. 
preferences +dotnet_style_qualification_for_event = false:silent +dotnet_style_qualification_for_field = false:silent +dotnet_style_qualification_for_method = false:silent +dotnet_style_qualification_for_property = false:silent + +# Language keywords vs BCL types preferences +dotnet_style_predefined_type_for_locals_parameters_members = true:warning +dotnet_style_predefined_type_for_member_access = true:warning + +# Parentheses preferences +dotnet_style_parentheses_in_arithmetic_binary_operators = always_for_clarity:silent +dotnet_style_parentheses_in_other_binary_operators = always_for_clarity:silent +dotnet_style_parentheses_in_other_operators = never_if_unnecessary:silent +dotnet_style_parentheses_in_relational_binary_operators = always_for_clarity:silent + +# Modifier preferences +dotnet_style_require_accessibility_modifiers = for_non_interface_members:silent + +# Expression-level preferences +csharp_style_deconstructed_variable_declaration = true:suggestion +csharp_style_inlined_variable_declaration = true:silent +csharp_style_throw_expression = true:warning +dotnet_style_coalesce_expression = true:error +dotnet_style_collection_initializer = true:warning +dotnet_style_explicit_tuple_names = true:error +dotnet_style_null_propagation = true:error +dotnet_style_object_initializer = true:silent +dotnet_style_prefer_auto_properties = true:silent +dotnet_style_prefer_compound_assignment = true:suggestion +dotnet_style_prefer_conditional_expression_over_assignment = true:silent +dotnet_style_prefer_conditional_expression_over_return = true:silent +dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion +dotnet_style_prefer_inferred_tuple_names = true:suggestion +dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion + +# Field preferences +dotnet_style_readonly_field = true:suggestion + +# Parameter preferences +dotnet_code_quality_unused_parameters = all:suggestion + +#### C# Coding Conventions #### + +# var preferences 
+csharp_style_var_elsewhere = true:silent +csharp_style_var_for_built_in_types = false:silent +csharp_style_var_when_type_is_apparent = true:silent + +# Expression-bodied members +csharp_style_expression_bodied_accessors = true:suggestion +csharp_style_expression_bodied_constructors = true:suggestion +csharp_style_expression_bodied_indexers = true:suggestion +csharp_style_expression_bodied_lambdas = true:silent +csharp_style_expression_bodied_local_functions = false:silent +csharp_style_expression_bodied_methods = true:suggestion +csharp_style_expression_bodied_operators = true:suggestion +csharp_style_expression_bodied_properties = true:suggestion + +# Pattern matching preferences +csharp_style_pattern_matching_over_as_with_null_check = true:warning +csharp_style_pattern_matching_over_is_with_cast_check = true:warning + +# Null-checking preferences +csharp_style_conditional_delegate_call = true:silent + +# Modifier preferences +csharp_prefer_static_local_function = true:suggestion +csharp_preferred_modifier_order = public,private,protected,internal,static,extern,new,virtual,abstract,sealed,override,readonly,unsafe,volatile,async + +# Code-block preferences +csharp_prefer_braces = true:error +csharp_prefer_simple_using_statement = true:suggestion + +# Expression-level preferences +csharp_prefer_simple_default_expression = true:error +csharp_style_pattern_local_over_anonymous_function = true:suggestion +csharp_style_prefer_index_operator = true:suggestion +csharp_style_prefer_range_operator = true:suggestion +csharp_style_unused_value_assignment_preference = discard_variable:suggestion +csharp_style_unused_value_expression_statement_preference = discard_variable:silent + +# 'using' directive preferences +csharp_using_directive_placement = outside_namespace:silent + +#### C# Formatting Rules #### + +# New line preferences +csharp_new_line_before_catch = true +csharp_new_line_before_else = true +csharp_new_line_before_finally = true 
+csharp_new_line_before_members_in_anonymous_types = true +csharp_new_line_before_members_in_object_initializers = true +csharp_new_line_before_open_brace = all +csharp_new_line_between_query_expression_clauses = true + +# Indentation preferences +csharp_indent_block_contents = true +csharp_indent_braces = false +csharp_indent_case_contents = true +csharp_indent_case_contents_when_block = true +csharp_indent_labels = one_less_than_current +csharp_indent_switch_labels = true + +# Space preferences +csharp_space_after_cast = false +csharp_space_after_colon_in_inheritance_clause = true +csharp_space_after_comma = true +csharp_space_after_dot = false +csharp_space_after_keywords_in_control_flow_statements = true +csharp_space_after_semicolon_in_for_statement = true +csharp_space_around_binary_operators = before_and_after +csharp_space_around_declaration_statements = false +csharp_space_before_colon_in_inheritance_clause = true +csharp_space_before_comma = false +csharp_space_before_dot = false +csharp_space_before_open_square_brackets = false +csharp_space_before_semicolon_in_for_statement = false +csharp_space_between_empty_square_brackets = false +csharp_space_between_method_call_empty_parameter_list_parentheses = false +csharp_space_between_method_call_name_and_opening_parenthesis = false +csharp_space_between_method_call_parameter_list_parentheses = false +csharp_space_between_method_declaration_empty_parameter_list_parentheses = false +csharp_space_between_method_declaration_name_and_open_parenthesis = false +csharp_space_between_method_declaration_parameter_list_parentheses = false +csharp_space_between_parentheses = false +csharp_space_between_square_brackets = false + +# Wrapping preferences +csharp_preserve_single_line_blocks = true +csharp_preserve_single_line_statements = true + +#### Naming styles #### + +# Naming rules + +dotnet_naming_rule.interface_should_be_begins_with_i.severity = suggestion +dotnet_naming_rule.interface_should_be_begins_with_i.symbols 
= interface +dotnet_naming_rule.interface_should_be_begins_with_i.style = begins_with_i + +dotnet_naming_rule.types_should_be_pascal_case.severity = suggestion +dotnet_naming_rule.types_should_be_pascal_case.symbols = types +dotnet_naming_rule.types_should_be_pascal_case.style = pascal_case + +dotnet_naming_rule.non_field_members_should_be_pascal_case.severity = suggestion +dotnet_naming_rule.non_field_members_should_be_pascal_case.symbols = non_field_members +dotnet_naming_rule.non_field_members_should_be_pascal_case.style = pascal_case + +# Symbol specifications + +dotnet_naming_symbols.interface.applicable_kinds = interface +dotnet_naming_symbols.interface.applicable_accessibilities = public, internal, private, protected, protected_internal +dotnet_naming_symbols.interface.required_modifiers = + +dotnet_naming_symbols.types.applicable_kinds = class, struct, interface, enum +dotnet_naming_symbols.types.applicable_accessibilities = public, internal, private, protected, protected_internal +dotnet_naming_symbols.types.required_modifiers = + +dotnet_naming_symbols.non_field_members.applicable_kinds = property, event, method +dotnet_naming_symbols.non_field_members.applicable_accessibilities = public, internal, private, protected, protected_internal +dotnet_naming_symbols.non_field_members.required_modifiers = + +# Naming styles + +dotnet_naming_style.pascal_case.required_prefix = +dotnet_naming_style.pascal_case.required_suffix = +dotnet_naming_style.pascal_case.word_separator = +dotnet_naming_style.pascal_case.capitalization = pascal_case + +dotnet_naming_style.begins_with_i.required_prefix = I +dotnet_naming_style.begins_with_i.required_suffix = +dotnet_naming_style.begins_with_i.word_separator = +dotnet_naming_style.begins_with_i.capitalization = pascal_case diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index e5c3fe7..cd003ee 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -146,10 +146,10 @@ public 
class CsvTokenizer private enum ParserFlags : byte { None, - ReadAnythingOnCurrentLine = 0b00000001, - ReadAnythingInCurrentField = 0b00000010, - CurrentFieldStartedWithQuote = 0b00000100, - QuotedFieldDataEnded = 0b00001000, + ReadAnythingOnCurrentLine = 0b00000001, + ReadAnythingInCurrentField = 0b00000010, + CurrentFieldStartedWithQuote = 0b00000100, + QuotedFieldDataEnded = 0b00001000, CutAtPotentiallyTerminalDoubleQuote = 0b00010000, } diff --git a/test/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs index 7a981ff..38d3d9c 100644 --- a/test/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -58,7 +58,7 @@ public long CountRowsUsingCsvHelper(CsvFile csvFile) } } - static int Main() + private static int Main() { var prog = new Program(); foreach (var csvFile in CsvFiles) @@ -88,11 +88,9 @@ public CsvFile(string fullPath) => public override string ToString() => FileName; } - static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) - { - return Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"), - fullPath => new CsvFile(fullPath)); - } + private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) => + Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"), + fullPath => new CsvFile(fullPath)); private sealed class RowCountingVisitor : CsvReaderVisitorBase { @@ -100,7 +98,7 @@ private sealed class RowCountingVisitor : CsvReaderVisitorBase public long RowCount { get; private set; } - public override void VisitEndOfRecord() => ++this.RowCount; + public override void VisitEndOfRecord() => ++RowCount; public override void VisitEndOfField(ReadOnlySpan chunk) { } diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index f1ab8e9..0c18b40 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ 
b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -94,7 +94,7 @@ private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvDat private static byte[][] VaryLineEndings(ReadOnlySpan fileData, int randomSeed) { - var resultLists = new List[] + List[] resultLists = { new List(), new List(), @@ -104,7 +104,7 @@ private static byte[][] VaryLineEndings(ReadOnlySpan fileData, int randomS new List(), }; - var lineEndings = new byte[][] + byte[][] lineEndings = { new byte[] { (byte)'\r' }, new byte[] { (byte)'\n' }, From 174390ad82f90b72e1110c362a2b99e112948852 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Wed, 22 May 2019 08:39:03 -0400 Subject: [PATCH 06/22] Do warnings-as-errors properly. Use FxCop for things that get distributed. --- Directory.Build.props | 2 +- src/Directory.Build.props | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Directory.Build.props b/Directory.Build.props index 9da6d13..b6bbf76 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -4,7 +4,7 @@ $(MSBuildThisFileDirectory) - true + true true 7.3 diff --git a/src/Directory.Build.props b/src/Directory.Build.props index 7c54da0..3ef70bd 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -17,6 +17,7 @@ + From 3b303017daf8aa563cd0c26bf7c8c80d1e755477 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 23 May 2019 07:39:19 -0400 Subject: [PATCH 07/22] Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e586d4c..f8d513b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ # Cursively -A fast, RFC4180-compliant CSV reading library for .NET. Written in C#. +A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. -Supports files encoded in either UTF-8 or single-byte Windows codepages. \ No newline at end of file +Fully supports all UTF-8 encoded byte streams. 
+- Other encodings will work as well, as long as the bytes `0x0A`, `0x0D`, `0x22`, and `0x2C` are all guaranteed to mean the same thing that they mean in ASCII / UTF-8, and as long as the encoding defines no other byte sequences which identify the Unicode code points for `'\n'`, `'\r'`, `'"'`, or `','`, respectively. +- In practice, this means that most "Extended ASCII" code pages will probably work, probably including all SBCS. Many "Extended ASCII" DBCS will probably work too, but it looks like Shift-JIS will *not* work. +- Notably, this library will fail to yield the correct result when used with byte streams encoded in any variant of UTF-16 or UTF-32, even with a BOM header. If you require that support, there are other libraries that should work for you. + +Fully supports all streams that completely conform to the RFC 4180 format, and defines rules for how to handle streams that break certain rules of RFC 4180 in a way that seems to be consistent with other popular tools, at a minor speed penalty. From 9c5e6b4f6dcf5def1b5ab9cf4e783ad4728e53a7 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 23 May 2019 09:03:27 -0400 Subject: [PATCH 08/22] Update README.md --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index f8d513b..586cc28 100644 --- a/README.md +++ b/README.md @@ -7,3 +7,24 @@ Fully supports all UTF-8 encoded byte streams. - Notably, this library will fail to yield the correct result when used with byte streams encoded in any variant of UTF-16 or UTF-32, even with a BOM header. If you require that support, there are other libraries that should work for you. Fully supports all streams that completely conform to the RFC 4180 format, and defines rules for how to handle streams that break certain rules of RFC 4180 in a way that seems to be consistent with other popular tools, at a minor speed penalty. 
+ +This library exists because the original developer was unsatisfied with the performance characteristics of raw CSV processing tools. Everything out there seemed to have some combination of these flaws: +1. Tons of managed heap allocations on hot paths, often baked into the API requirements +1. Decoding to UTF-16LE **before** scanning for critical bytes, which could be considered a subset of: +1. The design forces a ton of processing to happen on the input which the caller might not even care about +1. Omitting important parts of RFC 4180 +1. Disappointing options for mitigating DDoS risk + +"RFC 4180 over UTF-8" is a very simple byte stream format, and the state machine requires only a few extra states to define how to handle all UTF-8 streams that are non-RFC 4180, so it seemed odd that there wasn't a reader without these flaws. + +With Cursively, +1. each stream only strictly requires a grand total of two objects to be allocated on the managed heap*, + - *in case this is too much, both could be reset and put into a pool to be reused for processing other streams +1. processing happens directly on the input bytes (no decoding is done by Cursively itself), +1. the only processing that Cursively necessarily does is the bare minimum needed to describe the data to the caller, +1. inputs that conform to RFC 4180* are processed according to all the rules of RFC 4180, and + - *inputs that do not conform to RFC 4180 are handled according to consistent, intuitive rules +1. there is a very low risk* of DDoS directly from using Cursively, and the caller has the tools that they need in order to prevent (or respond to) attacks in a more "natural" way than other CSV libraries that the developer has seen. + - *There is no such thing as "risk-free" in our world. Cursively itself cannot eliminate the risk of attacks that use it as a vector to exploit defects in CoreFX / C# compiler / runtime / OS / hardware. 
+ +Future enhancements may add support for byte streams in other encodings if there's demand for it, but not at the expense of anything that matters to the "RFC 4180 over UTF-8" use case. From d5178b60ccd4b061abe98af577617abc1f24b7e4 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 23 May 2019 09:06:18 -0400 Subject: [PATCH 09/22] Finish this thought. Resolves #1 --- src/Cursively/CsvTokenizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index cd003ee..26ff55e 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -185,7 +185,7 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi { // most of the time, we should be able to fully process each field in the same // loop iteration that we first start reading it. the most prominent exception - // is that + // is when we encounter a quoted field. PickUpFromLastTime(ref chunk, visitor); continue; } From e446cdce44a720b29b92b6fb937fbc86c622cbf8 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 23 May 2019 09:07:56 -0400 Subject: [PATCH 10/22] Fix this comment. 
"cut buffer" refers to an old idea from before the word "visitor" showed up Resolves #2 --- src/Cursively/CsvTokenizer.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 26ff55e..016e33d 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -369,8 +369,8 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpan Date: Thu, 23 May 2019 09:37:56 -0400 Subject: [PATCH 11/22] batch script updates --- calculate-coverage.bat | 4 ++-- run-benchmark.bat | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 run-benchmark.bat diff --git a/calculate-coverage.bat b/calculate-coverage.bat index a194a40..f92cdef 100644 --- a/calculate-coverage.bat +++ b/calculate-coverage.bat @@ -2,8 +2,8 @@ pushd %~dp0 nuget install OpenCover -Version 4.7.922 -OutputDirectory OpenCover\tools nuget install ReportGenerator -Version 4.1.4 -OutputDirectory OpenCover\tools dotnet build -c Release -FOR /F "tokens=* USEBACKQ" %%D IN (`where dotnet`) DO ( -SET DotNetPath=%%D +for /F "tokens=* USEBACKQ" %%D in (`where dotnet`) do ( +set DotNetPath=%%D ) OpenCover\tools\OpenCover.4.7.922\tools\OpenCover.Console.exe ^ "-target:%DotNetPath%" ^ diff --git a/run-benchmark.bat b/run-benchmark.bat new file mode 100644 index 0000000..4b05088 --- /dev/null +++ b/run-benchmark.bat @@ -0,0 +1,4 @@ +pushd %~dp0 +dotnet run -c Release --project test\Cursively.Benchmark\Cursively.Benchmark.csproj --framework netcoreapp2.2 +for %%D in (BenchmarkDotNet.Artifacts\results\*.html) do %%D +popd From b6f30ff06f6250cde0c1f308a64a9074f1dbbeb1 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 23 May 2019 09:48:23 -0400 Subject: [PATCH 12/22] Fix comment --- src/Cursively/CsvTokenizer.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index 016e33d..b92ea61 100644 --- 
a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -10,9 +10,9 @@ namespace Cursively /// /// The byte stream is tokenized according to the rules of the ASCII encoding. This makes it /// compatible with any encoding that encodes 0x0A, 0x0D, 0x22, and 0x2C the same way that ASCII - /// encodes them. Windows code pages and UTF-8 are notable examples of acceptable encodings. + /// encodes them. UTF-8 and Extended ASCII SBCS are notable examples of acceptable encodings. /// UTF-16 is a notable example of an unacceptable encoding; trying to use this class to process - /// text encoded in any other encoding will yield undesirable results without any errors. + /// text encoded in an unacceptable encoding will yield undesirable results without any errors. /// /// /// All bytes that appear in the stream except 0x0A, 0x0D, 0x22, and 0x2C are unconditionally From 5e4757348129181ff848c2032972bfde45a3dba3 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Thu, 23 May 2019 10:30:13 -0400 Subject: [PATCH 13/22] Add more benchmark files. 
--- .../large-csv-files/100-huge-records-quoted.csv | 3 +++ test/Cursively.Benchmark/large-csv-files/100-huge-records.csv | 3 +++ test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv | 3 +++ 3 files changed, 9 insertions(+) create mode 100644 test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv create mode 100644 test/Cursively.Benchmark/large-csv-files/100-huge-records.csv create mode 100644 test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv new file mode 100644 index 0000000..718947c --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:611a7ba4f69bf3ab34f1fbf3fbf4711bfa8fb91a210683bdf4c1915818f1cfe0 +size 4900444 diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv new file mode 100644 index 0000000..fde3ed5 --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e82c977d84c24a6b16063b634cbeab1e8409b34724b0ecf07893f45f8aadb53 +size 2900444 diff --git a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv new file mode 100644 index 0000000..61dd063 --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f1e211bf4eb14ab578ccf6aff141e8db41e80314b39b85fba5f047830f746e4 +size 10020000 From b9a5a50aefffa049cd59f1661ce99a0dda9b464c Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 09:11:51 -0400 Subject: [PATCH 14/22] Renames --- calculate-coverage.bat => calculate-coverage.cmd | 0 run-benchmark.bat => 
run-benchmark.cmd | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename calculate-coverage.bat => calculate-coverage.cmd (100%) rename run-benchmark.bat => run-benchmark.cmd (100%) diff --git a/calculate-coverage.bat b/calculate-coverage.cmd similarity index 100% rename from calculate-coverage.bat rename to calculate-coverage.cmd diff --git a/run-benchmark.bat b/run-benchmark.cmd similarity index 100% rename from run-benchmark.bat rename to run-benchmark.cmd From 20f67692b78282123e6b594ef68e4a015bb80d83 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 09:12:26 -0400 Subject: [PATCH 15/22] Tweak GitVersionTask properties --- src/Directory.Build.props | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Directory.Build.props b/src/Directory.Build.props index 3ef70bd..a83c324 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -13,6 +13,12 @@ true true snupkg + + + false + + + false From b2acfaa975c2730d4ace57be04a325bf7f6e8df5 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 09:45:37 -0400 Subject: [PATCH 16/22] Add stuff to generate docs. 
--- calculate-coverage.cmd | 16 +++++++-------- doc/articles/.gitkeep | 0 doc/docfx.json | 46 ++++++++++++++++++++++++++++++++++++++++++ doc/images/.gitkeep | 0 doc/index.md | 30 +++++++++++++++++++++++++++ doc/overwrite/.gitkeep | 0 doc/release-notes.md | 4 ++++ doc/toc.yml | 10 +++++++++ generate-docs.cmd | 17 ++++++++++++++++ push-docs.cmd | 20 ++++++++++++++++++ tools/.gitignore | 1 + 11 files changed, 136 insertions(+), 8 deletions(-) create mode 100644 doc/articles/.gitkeep create mode 100644 doc/docfx.json create mode 100644 doc/images/.gitkeep create mode 100644 doc/index.md create mode 100644 doc/overwrite/.gitkeep create mode 100644 doc/release-notes.md create mode 100644 doc/toc.yml create mode 100644 generate-docs.cmd create mode 100644 push-docs.cmd create mode 100644 tools/.gitignore diff --git a/calculate-coverage.cmd b/calculate-coverage.cmd index f92cdef..7376195 100644 --- a/calculate-coverage.cmd +++ b/calculate-coverage.cmd @@ -1,24 +1,24 @@ pushd %~dp0 -nuget install OpenCover -Version 4.7.922 -OutputDirectory OpenCover\tools -nuget install ReportGenerator -Version 4.1.4 -OutputDirectory OpenCover\tools +nuget install OpenCover -Version 4.7.922 -OutputDirectory tools +nuget install ReportGenerator -Version 4.1.4 -OutputDirectory tools dotnet build -c Release for /F "tokens=* USEBACKQ" %%D in (`where dotnet`) do ( set DotNetPath=%%D ) -OpenCover\tools\OpenCover.4.7.922\tools\OpenCover.Console.exe ^ +tools\OpenCover.4.7.922\tools\OpenCover.Console.exe ^ "-target:%DotNetPath%" ^ "-targetArgs:test -c Release --no-build" ^ "-filter:+[Cursively]* +[Cursively.*]* -[Cursively.Tests]*" ^ - -output:OpenCover\raw-coverage-results.xml ^ + -output:tools\raw-coverage-results.xml ^ -register:user ^ -oldstyle dotnet clean -c Release -dotnet OpenCover\tools\ReportGenerator.4.1.4\tools\netcoreapp2.1\ReportGenerator.dll ^ - -reports:OpenCover\raw-coverage-results.xml ^ - -targetdir:OpenCover\results +dotnet 
tools\ReportGenerator.4.1.4\tools\netcoreapp2.1\ReportGenerator.dll ^ + -reports:tools\raw-coverage-results.xml ^ + -targetdir:tools\results -OpenCover\results\index.htm +tools\results\index.htm popd diff --git a/doc/articles/.gitkeep b/doc/articles/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/docfx.json b/doc/docfx.json new file mode 100644 index 0000000..c933dc2 --- /dev/null +++ b/doc/docfx.json @@ -0,0 +1,46 @@ +{ + "metadata": [ + { + "src": [ + { + "files": [ "**.csproj" ], + "src": "../src" + } + ], + "dest": "obj/api" + } + ], + "build": { + "content": [ + { + "files": [ "**/*.yml" ], + "src": "obj/api", + "dest": "api" + }, + { + "files": ["articles/**.md", "articles/**/toc.yml", "toc.yml", "*.md" ] + } + ], + "resource": [ + { + "files": [ "images/**" ] + } + ], + "overwrite": "overwrite/**.md", + "dest": "obj/generated-site-content", + "globalMetadataFiles": [], + "fileMetadataFiles": [], + "template": [ "default" ], + "postProcessors": [], + "markdownEngineName": "markdig", + "xrefService": [ "https://xref.docs.microsoft.com/query?uid={uid}" ], + "globalMetadata": { + "_appTitle": "Cursively", + "_enableSearch": true, + "_gitContribute": { + "apiSpecFolder": "doc/overwrite" + } + }, + "maxParallelism": 1 + } +} diff --git a/doc/images/.gitkeep b/doc/images/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 0000000..586cc28 --- /dev/null +++ b/doc/index.md @@ -0,0 +1,30 @@ +# Cursively +A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. + +Fully supports all UTF-8 encoded byte streams. +- Other encodings will work as well, as long as the bytes `0x0A`, `0x0D`, `0x22`, and `0x2C` are all guaranteed to mean the same thing that they mean in ASCII / UTF-8, and as long as the encoding defines no other byte sequences which identify the Unicode code points for `'\n'`, `'\r'`, `'"'`, or `','`, respectively. 
+- In practice, this means that most "Extended ASCII" code pages will probably work, probably including all SBCS. Many "Extended ASCII" DBCS will probably work too, but it looks like Shift-JIS will *not* work. +- Notably, this library will fail to yield the correct result when used with byte streams encoded in any variant of UTF-16 or UTF-32, even with a BOM header. If you require that support, there are other libraries that should work for you. + +Fully supports all streams that completely conform to the RFC 4180 format, and defines rules for how to handle streams that break certain rules of RFC 4180 in a way that seems to be consistent with other popular tools, at a minor speed penalty. + +This library exists because the original developer was unsatisfied with the performance characteristics of raw CSV processing tools. Everything out there seemed to have some combination of these flaws: +1. Tons of managed heap allocations on hot paths, often baked into the API requirements +1. Decoding to UTF-16LE **before** scanning for critical bytes, which could be considered a subset of: +1. The design forces a ton of processing to happen on the input which the caller might not even care about +1. Omitting important parts of RFC 4180 +1. Disappointing options for mitigating DDoS risk + +"RFC 4180 over UTF-8" is a very simple byte stream format, and the state machine requires only a few extra states to define how to handle all UTF-8 streams that are non-RFC 4180, so it seemed odd that there wasn't a reader without these flaws. + +With Cursively, +1. each stream only strictly requires a grand total of two objects to be allocated on the managed heap*, + - *in case this is too much, both could be reset and put into a pool to be reused for processing other streams +1. processing happens directly on the input bytes (no decoding is done by Cursively itself), +1. the only processing that Cursively necessarily does is the bare minimum needed to describe the data to the caller, +1. 
inputs that conform to RFC 4180* are processed according to all the rules of RFC 4180, and + - *inputs that do not conform to RFC 4180 are handled according to consistent, intuitive rules +1. there is a very low risk* of DDoS directly from using Cursively, and the caller has the tools that they need in order to prevent (or respond to) attacks in a more "natural" way than other CSV libraries that the developer has seen. + - *There is no such thing as "risk-free" in our world. Cursively itself cannot eliminate the risk of attacks that use it as a vector to exploit defects in CoreFX / C# compiler / runtime / OS / hardware. + +Future enhancements may add support for byte streams in other encodings if there's demand for it, but not at the expense of anything that matters to the "RFC 4180 over UTF-8" use case. diff --git a/doc/overwrite/.gitkeep b/doc/overwrite/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/release-notes.md b/doc/release-notes.md new file mode 100644 index 0000000..8c1dc2d --- /dev/null +++ b/doc/release-notes.md @@ -0,0 +1,4 @@ +# Cursively Release Notes + +## 1.0.0 +- Initial release. 
diff --git a/doc/toc.yml b/doc/toc.yml new file mode 100644 index 0000000..aa4ba7c --- /dev/null +++ b/doc/toc.yml @@ -0,0 +1,10 @@ +- name: Home + href: index.md +- name: API Documentation + href: obj/api/ +- name: Release Notes + href: release-notes.md +- name: NuGet Package + href: https://www.nuget.org/packages/Cursively +- name: GitHub + href: https://github.com/airbreather/Cursively diff --git a/generate-docs.cmd b/generate-docs.cmd new file mode 100644 index 0000000..b4f9dff --- /dev/null +++ b/generate-docs.cmd @@ -0,0 +1,17 @@ +@echo off +REM =========================================================================== +REM Regenerates the https://airbreather.github.io/Cursively content locally +REM =========================================================================== +set DOCFX_PACKAGE_VERSION=2.42.4 +pushd %~dp0 +REM incremental / cached builds tweak things about the output, so let's do it +REM all fresh if we can help it... +rd /s /q src\Cursively\obj +rd /s /q doc\obj +dotnet restore +pushd tools +rd /s /q docfx.console.%DOCFX_PACKAGE_VERSION% +nuget install docfx.console -Version %DOCFX_PACKAGE_VERSION% +popd +%~dp0\tools\docfx.console.%DOCFX_PACKAGE_VERSION%\tools\docfx doc\docfx.json +popd diff --git a/push-docs.cmd b/push-docs.cmd new file mode 100644 index 0000000..12188f6 --- /dev/null +++ b/push-docs.cmd @@ -0,0 +1,20 @@ +@echo off +REM =========================================================================== +REM Pushes the https://airbreather.github.io/Cursively content to the server +REM (run generate-docs.cmd first) +REM =========================================================================== +pushd %~dp0 +pushd doc\obj +rd /s /q gh-pages +git clone --branch gh-pages %~dp0 gh-pages +pushd gh-pages +git rm -r . +xcopy /Q /E /R /Y ..\generated-site-content . +git add . +git commit -m "Update docs. This was performed automatically." 
+git push origin gh-pages +popd +rd /s /q gh-pages +popd +git push origin gh-pages +popd diff --git a/tools/.gitignore b/tools/.gitignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/tools/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file From ea4af02380034ed8e3e661f31aa0fae697f45334 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 09:51:39 -0400 Subject: [PATCH 17/22] Try again on this --- src/Cursively/CsvReaderVisitorBase.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 1b804ab..5b0eea6 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -38,16 +38,22 @@ public abstract class CsvReaderVisitorBase /// /// /// + /// /// Field is split across multiple read buffer chunks, or else it runs up to the very end of /// a read buffer chunk, but we can't prove it without the first byte of the next chunk or a /// call. + /// /// /// + /// /// Quoted field contains a literal quote that was escaped in the original stream, and so we /// cannot yield the entire field data as-is. + /// /// /// + /// /// Stream does not conform to RFC 4180, and optimizing such streams to avoid this case. + /// /// /// /// From a8ac64bdd29f4e8966cb4cf4916a9e615f6daade Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 09:55:34 -0400 Subject: [PATCH 18/22] Fix another bad --- src/Cursively/CsvTokenizer.cs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index b92ea61..8dd039e 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -38,38 +38,52 @@ namespace Cursively /// /// /// + /// /// The spec says that separate lines are delimited by CRLF line breaks. This implementation /// accepts line breaks of any format (CRLF, LF, CR). 
+ /// /// /// + /// /// The spec says that there may or may not be a line break at the end of the last record in the /// stream. This implementation does not require there to be a line break, and it would not /// hurt to add one either. + /// /// /// + /// /// The spec refers to an optional header line at the beginning. This implementation does not /// include any special treatment for the first line of fields; if they need to be treated as /// headers, then the consumer needs to know that and respond accordingly. + /// /// /// + /// /// The spec says each record may contain "one or more fields". This implementation interprets /// that to mean strictly that any number of consecutive newline characters in a row are treated /// as one. + /// /// /// + /// /// Many implementations allow the delimiter character to be configured to be something else /// other than a comma. This implementation does not currently offer that flexibility. + /// /// /// + /// /// Many implementations allow automatically trimming whitespace at the beginning and/or end of /// each field (sometimes optionally). The spec expressly advises against doing that, and this /// implementation follows suit. It is our opinion that consumers ought to be more than capable /// of trimming spaces at the beginning or end as part of their processing if this is desired. + /// /// /// + /// /// The spec says that the last field in a record must not be followed by a comma. This /// implementation interprets that to mean that if we do see a comma followed immediately by a /// line ending character, then that represents the data for an empty field. 
+ /// /// /// /// From dbc873bd98e362bd7cca7210971d38bd8a61b334 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 10:55:45 -0400 Subject: [PATCH 19/22] Put usage info in README --- README.md | 109 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 586cc28..cfd7a6d 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,85 @@ # Cursively A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. -Fully supports all UTF-8 encoded byte streams. -- Other encodings will work as well, as long as the bytes `0x0A`, `0x0D`, `0x22`, and `0x2C` are all guaranteed to mean the same thing that they mean in ASCII / UTF-8, and as long as the encoding defines no other byte sequences which identify the Unicode code points for `'\n'`, `'\r'`, `'"'`, or `','`, respectively. -- In practice, this means that most "Extended ASCII" code pages will probably work, probably including all SBCS. Many "Extended ASCII" DBCS will probably work too, but it looks like Shift-JIS will *not* work. -- Notably, this library will fail to yield the correct result when used with byte streams encoded in any variant of UTF-16 or UTF-32, even with a BOM header. If you require that support, there are other libraries that should work for you. - -Fully supports all streams that completely conform to the RFC 4180 format, and defines rules for how to handle streams that break certain rules of RFC 4180 in a way that seems to be consistent with other popular tools, at a minor speed penalty. - -This library exists because the original developer was unsatisfied with the performance characteristics of raw CSV processing tools. Everything out there seemed to have some combination of these flaws: -1. Tons of managed heap allocations on hot paths, often baked into the API requirements -1. 
Decoding to UTF-16LE **before** scanning for critical bytes, which could be considered a subset of: -1. The design forces a ton of processing to happen on the input which the caller might not even care about -1. Omitting important parts of RFC 4180 -1. Disappointing options for mitigating DDoS risk - -"RFC 4180 over UTF-8" is a very simple byte stream format, and the state machine requires only a few extra states to define how to handle all UTF-8 streams that are non-RFC 4180, so it seemed odd that there wasn't a reader without these flaws. - -With Cursively, -1. each stream only strictly requires a grand total of two objects to be allocated on the managed heap*, - - *in case this is too much, both could be reset and put into a pool to be reused for processing other streams -1. processing happens directly on the input bytes (no decoding is done by Cursively itself), -1. the only processing that Cursively necessarily does is the bare minimum needed to describe the data to the caller, -1. inputs that conform to RFC 4180* are processed according to all the rules of RFC 4180, and - - *inputs that do not conform to RFC 4180 are handled according to consistent, intuitive rules -1. there is a very low risk* of DDoS directly from using Cursively, and the caller has the tools that they need in order to prevent (or respond to) attacks in a more "natural" way than other CSV libraries that the developer has seen. - - *There is no such thing as "risk-free" in our world. Cursively itself cannot eliminate the risk of attacks that use it as a vector to exploit defects in CoreFX / C# compiler / runtime / OS / hardware. - -Future enhancements may add support for byte streams in other encodings if there's demand for it, but not at the expense of anything that matters to the "RFC 4180 over UTF-8" use case. +## Usage +1. Create a subclass of `CsvReaderVisitorBase` with your own logic. +1. To read a CSV file: + - Create a new instance of your visitor. 
+ - Create a new instance of `CsvTokenizer`. + - Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file. + - Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file. + +## Example +This demonstrates using Cursively to asynchronously write the details of a particular UTF-8 encoded file to the console. + +```csharp +public static void ProcessCsvFile(string csvFilePath) +{ + var myVisitor = new MyVisitor(maxFieldLength: 1000); + var tokenizer = new CsvTokenizer(); + using (var file = File.OpenRead(csvFilePath)) + { + Console.WriteLine($"Started reading '{csvFilePath}'."); + Span<byte> fileReadBuffer = new byte[4096]; + while (true) + { + int count = file.Read(fileReadBuffer); + if (count == 0) + { + break; + } + + var chunk = fileReadBuffer.Slice(0, count); + tokenizer.ProcessNextChunk(chunk, myVisitor); + } + + tokenizer.ProcessEndOfStream(myVisitor); + } + + Console.WriteLine($"Finished reading '{csvFilePath}'."); +} + +public sealed class MyVisitor : CsvReaderVisitorBase +{ + private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder(); + + private readonly char[] _buffer; + + private int _bufferConsumed; + + public MyVisitor(int maxFieldLength) => + _buffer = new char[maxFieldLength]; + + public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) => + VisitFieldContents(chunk, flush: false); + + public override void VisitEndOfField(ReadOnlySpan<byte> chunk) => + VisitFieldContents(chunk, flush: true); + + public override void VisitEndOfRecord() => + Console.WriteLine("End of fields for this record."); + + private void VisitFieldContents(ReadOnlySpan<byte> chunk, bool flush) + { + int charCount = _utf8Decoder.GetCharCount(chunk, flush); + if (charCount + _bufferConsumed <= _buffer.Length) + { + _utf8Decoder.GetChars(chunk, new Span<char>(_buffer, _bufferConsumed, charCount), flush); + _bufferConsumed += charCount; + } + else + { + throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); + } + + if (!flush) + { + return; +
} + + Console.Write("Field: "); + Console.WriteLine(_buffer, 0, _bufferConsumed); + _bufferConsumed = 0; + } +} +``` From b677609f2b56ad34b2955b213e5f5f2e3d54d38f Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 10:56:31 -0400 Subject: [PATCH 20/22] It's not async anymore --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfd7a6d..3f93c38 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading l - Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file. ## Example -This demonstrates using Cursively to asynchronously write the details of a particular UTF-8 encoded file to the console. +This demonstrates using Cursively to write the details of a particular UTF-8 encoded file to the console. ```csharp public static void ProcessCsvFile(string csvFilePath) From 483b53236ab3d519fc99b87b0b5969c95f52c426 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 11:01:44 -0400 Subject: [PATCH 21/22] Add doc link and badges --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 3f93c38..7868df7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,14 @@ # Cursively A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. +## Documentation +Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html). 
+ +## Project status +| License | CI (AppVeyor) | NuGet | MyGet (pre-release) | +| ------- | ------------- | ----- | ------------------- | +| [![License](https://img.shields.io/github/license/airbreather/Cursively.svg)](https://github.com/airbreather/Cursively/blob/develop/LICENSE.md) | [![CI](https://ci.appveyor.com/api/projects/status/aqr1kmj9qqfx6ple?svg=true)](https://ci.appveyor.com/project/airbreather/Cursively) | [![NuGet](https://img.shields.io/nuget/v/Cursively.svg)](https://www.nuget.org/packages/Cursively/) | [![MyGet](https://img.shields.io/myget/airbreather/vpre/Cursively.svg?style=flat)](https://myget.org/feed/airbreather/package/nuget/Cursively) | + ## Usage 1. Create a subclass of `CsvReaderVisitorBase` with your own logic. 1. To read a CSV file: From 41326770e4701abf49e1e65aa6e2728a417d2fc5 Mon Sep 17 00:00:00 2001 From: Joe Amenta Date: Sun, 26 May 2019 11:02:15 -0400 Subject: [PATCH 22/22] probably looks better up above --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7868df7..ba574db 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,13 @@ # Cursively A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. -## Documentation -Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html). 
- -## Project status | License | CI (AppVeyor) | NuGet | MyGet (pre-release) | | ------- | ------------- | ----- | ------------------- | | [![License](https://img.shields.io/github/license/airbreather/Cursively.svg)](https://github.com/airbreather/Cursively/blob/develop/LICENSE.md) | [![CI](https://ci.appveyor.com/api/projects/status/aqr1kmj9qqfx6ple?svg=true)](https://ci.appveyor.com/project/airbreather/Cursively) | [![NuGet](https://img.shields.io/nuget/v/Cursively.svg)](https://www.nuget.org/packages/Cursively/) | [![MyGet](https://img.shields.io/myget/airbreather/vpre/Cursively.svg?style=flat)](https://myget.org/feed/airbreather/package/nuget/Cursively) | +## Documentation +Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html). + ## Usage 1. Create a subclass of `CsvReaderVisitorBase` with your own logic. 1. To read a CSV file: