diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..8719751 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,196 @@ +# Remove the line below if you want to inherit .editorconfig settings from higher directories +root = true + +# C# files +[*.cs] + +#### Core EditorConfig Options #### + +# Indentation and spacing +indent_size = 4 +indent_style = space +tab_width = 4 + +# New line preferences +end_of_line = crlf +insert_final_newline = true + +#### .NET Coding Conventions #### + +# Organize usings +dotnet_separate_import_directive_groups = true +dotnet_sort_system_directives_first = true + +# this. and Me. preferences +dotnet_style_qualification_for_event = false:silent +dotnet_style_qualification_for_field = false:silent +dotnet_style_qualification_for_method = false:silent +dotnet_style_qualification_for_property = false:silent + +# Language keywords vs BCL types preferences +dotnet_style_predefined_type_for_locals_parameters_members = true:warning +dotnet_style_predefined_type_for_member_access = true:warning + +# Parentheses preferences +dotnet_style_parentheses_in_arithmetic_binary_operators = always_for_clarity:silent +dotnet_style_parentheses_in_other_binary_operators = always_for_clarity:silent +dotnet_style_parentheses_in_other_operators = never_if_unnecessary:silent +dotnet_style_parentheses_in_relational_binary_operators = always_for_clarity:silent + +# Modifier preferences +dotnet_style_require_accessibility_modifiers = for_non_interface_members:silent + +# Expression-level preferences +csharp_style_deconstructed_variable_declaration = true:suggestion +csharp_style_inlined_variable_declaration = true:silent +csharp_style_throw_expression = true:warning +dotnet_style_coalesce_expression = true:error +dotnet_style_collection_initializer = true:warning +dotnet_style_explicit_tuple_names = true:error +dotnet_style_null_propagation = true:error +dotnet_style_object_initializer = true:silent +dotnet_style_prefer_auto_properties = 
true:silent +dotnet_style_prefer_compound_assignment = true:suggestion +dotnet_style_prefer_conditional_expression_over_assignment = true:silent +dotnet_style_prefer_conditional_expression_over_return = true:silent +dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion +dotnet_style_prefer_inferred_tuple_names = true:suggestion +dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion + +# Field preferences +dotnet_style_readonly_field = true:suggestion + +# Parameter preferences +dotnet_code_quality_unused_parameters = all:suggestion + +#### C# Coding Conventions #### + +# var preferences +csharp_style_var_elsewhere = true:silent +csharp_style_var_for_built_in_types = false:silent +csharp_style_var_when_type_is_apparent = true:silent + +# Expression-bodied members +csharp_style_expression_bodied_accessors = true:suggestion +csharp_style_expression_bodied_constructors = true:suggestion +csharp_style_expression_bodied_indexers = true:suggestion +csharp_style_expression_bodied_lambdas = true:silent +csharp_style_expression_bodied_local_functions = false:silent +csharp_style_expression_bodied_methods = true:suggestion +csharp_style_expression_bodied_operators = true:suggestion +csharp_style_expression_bodied_properties = true:suggestion + +# Pattern matching preferences +csharp_style_pattern_matching_over_as_with_null_check = true:warning +csharp_style_pattern_matching_over_is_with_cast_check = true:warning + +# Null-checking preferences +csharp_style_conditional_delegate_call = true:silent + +# Modifier preferences +csharp_prefer_static_local_function = true:suggestion +csharp_preferred_modifier_order = public,private,protected,internal,static,extern,new,virtual,abstract,sealed,override,readonly,unsafe,volatile,async + +# Code-block preferences +csharp_prefer_braces = true:error +csharp_prefer_simple_using_statement = true:suggestion + +# Expression-level preferences +csharp_prefer_simple_default_expression = true:error 
+csharp_style_pattern_local_over_anonymous_function = true:suggestion +csharp_style_prefer_index_operator = true:suggestion +csharp_style_prefer_range_operator = true:suggestion +csharp_style_unused_value_assignment_preference = discard_variable:suggestion +csharp_style_unused_value_expression_statement_preference = discard_variable:silent + +# 'using' directive preferences +csharp_using_directive_placement = outside_namespace:silent + +#### C# Formatting Rules #### + +# New line preferences +csharp_new_line_before_catch = true +csharp_new_line_before_else = true +csharp_new_line_before_finally = true +csharp_new_line_before_members_in_anonymous_types = true +csharp_new_line_before_members_in_object_initializers = true +csharp_new_line_before_open_brace = all +csharp_new_line_between_query_expression_clauses = true + +# Indentation preferences +csharp_indent_block_contents = true +csharp_indent_braces = false +csharp_indent_case_contents = true +csharp_indent_case_contents_when_block = true +csharp_indent_labels = one_less_than_current +csharp_indent_switch_labels = true + +# Space preferences +csharp_space_after_cast = false +csharp_space_after_colon_in_inheritance_clause = true +csharp_space_after_comma = true +csharp_space_after_dot = false +csharp_space_after_keywords_in_control_flow_statements = true +csharp_space_after_semicolon_in_for_statement = true +csharp_space_around_binary_operators = before_and_after +csharp_space_around_declaration_statements = false +csharp_space_before_colon_in_inheritance_clause = true +csharp_space_before_comma = false +csharp_space_before_dot = false +csharp_space_before_open_square_brackets = false +csharp_space_before_semicolon_in_for_statement = false +csharp_space_between_empty_square_brackets = false +csharp_space_between_method_call_empty_parameter_list_parentheses = false +csharp_space_between_method_call_name_and_opening_parenthesis = false +csharp_space_between_method_call_parameter_list_parentheses = false 
+csharp_space_between_method_declaration_empty_parameter_list_parentheses = false +csharp_space_between_method_declaration_name_and_open_parenthesis = false +csharp_space_between_method_declaration_parameter_list_parentheses = false +csharp_space_between_parentheses = false +csharp_space_between_square_brackets = false + +# Wrapping preferences +csharp_preserve_single_line_blocks = true +csharp_preserve_single_line_statements = true + +#### Naming styles #### + +# Naming rules + +dotnet_naming_rule.interface_should_be_begins_with_i.severity = suggestion +dotnet_naming_rule.interface_should_be_begins_with_i.symbols = interface +dotnet_naming_rule.interface_should_be_begins_with_i.style = begins_with_i + +dotnet_naming_rule.types_should_be_pascal_case.severity = suggestion +dotnet_naming_rule.types_should_be_pascal_case.symbols = types +dotnet_naming_rule.types_should_be_pascal_case.style = pascal_case + +dotnet_naming_rule.non_field_members_should_be_pascal_case.severity = suggestion +dotnet_naming_rule.non_field_members_should_be_pascal_case.symbols = non_field_members +dotnet_naming_rule.non_field_members_should_be_pascal_case.style = pascal_case + +# Symbol specifications + +dotnet_naming_symbols.interface.applicable_kinds = interface +dotnet_naming_symbols.interface.applicable_accessibilities = public, internal, private, protected, protected_internal +dotnet_naming_symbols.interface.required_modifiers = + +dotnet_naming_symbols.types.applicable_kinds = class, struct, interface, enum +dotnet_naming_symbols.types.applicable_accessibilities = public, internal, private, protected, protected_internal +dotnet_naming_symbols.types.required_modifiers = + +dotnet_naming_symbols.non_field_members.applicable_kinds = property, event, method +dotnet_naming_symbols.non_field_members.applicable_accessibilities = public, internal, private, protected, protected_internal +dotnet_naming_symbols.non_field_members.required_modifiers = + +# Naming styles + 
+dotnet_naming_style.pascal_case.required_prefix = +dotnet_naming_style.pascal_case.required_suffix = +dotnet_naming_style.pascal_case.word_separator = +dotnet_naming_style.pascal_case.capitalization = pascal_case + +dotnet_naming_style.begins_with_i.required_prefix = I +dotnet_naming_style.begins_with_i.required_suffix = +dotnet_naming_style.begins_with_i.word_separator = +dotnet_naming_style.begins_with_i.capitalization = pascal_case diff --git a/Cursively.sln b/Cursively.sln index 4c11141..db29ccb 100644 --- a/Cursively.sln +++ b/Cursively.sln @@ -5,10 +5,10 @@ VisualStudioVersion = 16.0.28803.452 MinimumVisualStudioVersion = 15.0.26124.0 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively", "src\Cursively\Cursively.csproj", "{C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively.Benchmark", "src\Cursively.Benchmark\Cursively.Benchmark.csproj", "{B97EC4E4-F878-4595-BD80-7B30CBAAF986}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively.Tests", "test\Cursively.Tests\Cursively.Tests.csproj", "{029F8A31-11BF-490B-9413-AD77D1B5FDAB}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cursively.Benchmark", "test\Cursively.Benchmark\Cursively.Benchmark.csproj", "{D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -31,18 +31,6 @@ Global {C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}.Release|x64.Build.0 = Release|Any CPU {C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}.Release|x86.ActiveCfg = Release|Any CPU {C67FC045-A9A8-4A97-B6AA-72503EEEA9FB}.Release|x86.Build.0 = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|Any CPU.Build.0 = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x64.ActiveCfg = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x64.Build.0 = 
Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x86.ActiveCfg = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Debug|x86.Build.0 = Debug|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|Any CPU.ActiveCfg = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|Any CPU.Build.0 = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x64.ActiveCfg = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x64.Build.0 = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x86.ActiveCfg = Release|Any CPU - {B97EC4E4-F878-4595-BD80-7B30CBAAF986}.Release|x86.Build.0 = Release|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Debug|Any CPU.Build.0 = Debug|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Debug|x64.ActiveCfg = Debug|Any CPU @@ -55,6 +43,18 @@ Global {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Release|x64.Build.0 = Release|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Release|x86.ActiveCfg = Release|Any CPU {029F8A31-11BF-490B-9413-AD77D1B5FDAB}.Release|x86.Build.0 = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x64.ActiveCfg = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x64.Build.0 = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x86.ActiveCfg = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Debug|x86.Build.0 = Debug|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|Any CPU.Build.0 = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x64.ActiveCfg = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x64.Build.0 = Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x86.ActiveCfg 
= Release|Any CPU + {D4755C6D-B7DB-44B7-AB1D-726CEBB6E5D0}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/Directory.Build.props b/Directory.Build.props index 3d31492..b6bbf76 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -4,10 +4,19 @@ $(MSBuildThisFileDirectory) - true + true true 7.3 + true + + true + + + + + + diff --git a/README.md b/README.md index e586d4c..ba574db 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,92 @@ # Cursively -A fast, RFC4180-compliant CSV reading library for .NET. Written in C#. +A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. -Supports files encoded in either UTF-8 or single-byte Windows codepages. \ No newline at end of file +| License | CI (AppVeyor) | NuGet | MyGet (pre-release) | +| ------- | ------------- | ----- | ------------------- | +| [![License](https://img.shields.io/github/license/airbreather/Cursively.svg)](https://github.com/airbreather/Cursively/blob/develop/LICENSE.md) | [![CI](https://ci.appveyor.com/api/projects/status/aqr1kmj9qqfx6ple?svg=true)](https://ci.appveyor.com/project/airbreather/Cursively) | [![NuGet](https://img.shields.io/nuget/v/Cursively.svg)](https://www.nuget.org/packages/Cursively/) | [![MyGet](https://img.shields.io/myget/airbreather/vpre/Cursively.svg?style=flat)](https://myget.org/feed/airbreather/package/nuget/Cursively) | + +## Documentation +Documentation is currently being published as [GitHub Pages](https://airbreather.github.io/Cursively/index.html). + +## Usage +1. Create a subclass of `CsvReaderVisitorBase` with your own logic. +1. To read a CSV file: + - Create a new instance of your visitor. + - Create a new instance of `CsvTokenizer`. + - Call `CsvTokenizer.ProcessNextChunk` for each chunk of the file. + - Call `CsvTokenizer.ProcessEndOfStream` after the last chunk of the file. 
+ +## Example +This demonstrates using Cursively to write the details of a particular UTF-8 encoded file to the console. + +```csharp +public static void ProcessCsvFile(string csvFilePath) +{ + var myVisitor = new MyVisitor(maxFieldLength: 1000); + var tokenizer = new CsvTokenizer(); + using (var file = File.OpenRead(csvFilePath)) + { + Console.WriteLine($"Started reading '{csvFilePath}'."); + Span<byte> fileReadBuffer = new byte[4096]; + while (true) + { + int count = file.Read(fileReadBuffer); + if (count == 0) + { + break; + } + + var chunk = fileReadBuffer.Slice(0, count); + tokenizer.ProcessNextChunk(chunk, myVisitor); + } + + tokenizer.ProcessEndOfStream(myVisitor); + } + + Console.WriteLine($"Finished reading '{csvFilePath}'."); +} + +public sealed class MyVisitor : CsvReaderVisitorBase +{ + private readonly Decoder _utf8Decoder = Encoding.UTF8.GetDecoder(); + + private readonly char[] _buffer; + + private int _bufferConsumed; + + public MyVisitor(int maxFieldLength) => + _buffer = new char[maxFieldLength]; + + public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) => + VisitFieldContents(chunk, flush: false); + + public override void VisitEndOfField(ReadOnlySpan<byte> chunk) => + VisitFieldContents(chunk, flush: true); + + public override void VisitEndOfRecord() => + Console.WriteLine("End of fields for this record."); + + private void VisitFieldContents(ReadOnlySpan<byte> chunk, bool flush) + { + int charCount = _utf8Decoder.GetCharCount(chunk, flush); + if (charCount + _bufferConsumed <= _buffer.Length) + { + _utf8Decoder.GetChars(chunk, new Span<char>(_buffer, _bufferConsumed, charCount), flush); + _bufferConsumed += charCount; + } + else + { + throw new InvalidDataException($"Field is longer than {_buffer.Length} characters."); + } + + if (!flush) + { + return; + } + + Console.Write("Field: "); + Console.WriteLine(_buffer, 0, _bufferConsumed); + _bufferConsumed = 0; + } +} +``` diff --git a/calculate-coverage.bat b/calculate-coverage.bat deleted file mode 100644 
index a194a40..0000000 --- a/calculate-coverage.bat +++ /dev/null @@ -1,24 +0,0 @@ -pushd %~dp0 -nuget install OpenCover -Version 4.7.922 -OutputDirectory OpenCover\tools -nuget install ReportGenerator -Version 4.1.4 -OutputDirectory OpenCover\tools -dotnet build -c Release -FOR /F "tokens=* USEBACKQ" %%D IN (`where dotnet`) DO ( -SET DotNetPath=%%D -) -OpenCover\tools\OpenCover.4.7.922\tools\OpenCover.Console.exe ^ - "-target:%DotNetPath%" ^ - "-targetArgs:test -c Release --no-build" ^ - "-filter:+[Cursively]* +[Cursively.*]* -[Cursively.Tests]*" ^ - -output:OpenCover\raw-coverage-results.xml ^ - -register:user ^ - -oldstyle - -dotnet clean -c Release - -dotnet OpenCover\tools\ReportGenerator.4.1.4\tools\netcoreapp2.1\ReportGenerator.dll ^ - -reports:OpenCover\raw-coverage-results.xml ^ - -targetdir:OpenCover\results - -OpenCover\results\index.htm - -popd diff --git a/calculate-coverage.cmd b/calculate-coverage.cmd new file mode 100644 index 0000000..7376195 --- /dev/null +++ b/calculate-coverage.cmd @@ -0,0 +1,24 @@ +pushd %~dp0 +nuget install OpenCover -Version 4.7.922 -OutputDirectory tools +nuget install ReportGenerator -Version 4.1.4 -OutputDirectory tools +dotnet build -c Release +for /F "tokens=* USEBACKQ" %%D in (`where dotnet`) do ( +set DotNetPath=%%D +) +tools\OpenCover.4.7.922\tools\OpenCover.Console.exe ^ + "-target:%DotNetPath%" ^ + "-targetArgs:test -c Release --no-build" ^ + "-filter:+[Cursively]* +[Cursively.*]* -[Cursively.Tests]*" ^ + -output:tools\raw-coverage-results.xml ^ + -register:user ^ + -oldstyle + +dotnet clean -c Release + +dotnet tools\ReportGenerator.4.1.4\tools\netcoreapp2.1\ReportGenerator.dll ^ + -reports:tools\raw-coverage-results.xml ^ + -targetdir:tools\results + +tools\results\index.htm + +popd diff --git a/doc/articles/.gitkeep b/doc/articles/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/docfx.json b/doc/docfx.json new file mode 100644 index 0000000..c933dc2 --- /dev/null +++ b/doc/docfx.json @@ -0,0 
+1,46 @@ +{ + "metadata": [ + { + "src": [ + { + "files": [ "**.csproj" ], + "src": "../src" + } + ], + "dest": "obj/api" + } + ], + "build": { + "content": [ + { + "files": [ "**/*.yml" ], + "src": "obj/api", + "dest": "api" + }, + { + "files": ["articles/**.md", "articles/**/toc.yml", "toc.yml", "*.md" ] + } + ], + "resource": [ + { + "files": [ "images/**" ] + } + ], + "overwrite": "overwrite/**.md", + "dest": "obj/generated-site-content", + "globalMetadataFiles": [], + "fileMetadataFiles": [], + "template": [ "default" ], + "postProcessors": [], + "markdownEngineName": "markdig", + "xrefService": [ "https://xref.docs.microsoft.com/query?uid={uid}" ], + "globalMetadata": { + "_appTitle": "Cursively", + "_enableSearch": true, + "_gitContribute": { + "apiSpecFolder": "doc/overwrite" + } + }, + "maxParallelism": 1 + } +} diff --git a/doc/images/.gitkeep b/doc/images/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 0000000..586cc28 --- /dev/null +++ b/doc/index.md @@ -0,0 +1,30 @@ +# Cursively +A fast, [RFC 4180](https://tools.ietf.org/html/rfc4180)-conforming CSV reading library for .NET. Written in C#. + +Fully supports all UTF-8 encoded byte streams. +- Other encodings will work as well, as long as the bytes `0x0A`, `0x0D`, `0x22`, and `0x2C` are all guaranteed to mean the same thing that they mean in ASCII / UTF-8, and as long as the encoding defines no other byte sequences which identify the Unicode code points for `'\n'`, `'\r'`, `'"'`, or `','`, respectively. +- In practice, this means that most "Extended ASCII" code pages will probably work, probably including all SBCS. Many "Extended ASCII" DBCS will probably work too, but it looks like Shift-JIS will *not* work. +- Notably, this library will fail to yield the correct result when used with byte streams encoded in any variant of UTF-16 or UTF-32, even with a BOM header. 
If you require that support, there are other libraries that should work for you. + +Fully supports all streams that completely conform to the RFC 4180 format, and defines rules for how to handle streams that break certain rules of RFC 4180 in a way that seems to be consistent with other popular tools, at a minor speed penalty. + +This library exists because the original developer was unsatisfied with the performance characteristics of raw CSV processing tools. Everything out there seemed to have some combination of these flaws: +1. Tons of managed heap allocations on hot paths, often baked into the API requirements +1. Decoding to UTF-16LE **before** scanning for critical bytes, which could be considered a subset of: +1. The design forces a ton of processing to happen on the input which the caller might not even care about +1. Omitting important parts of RFC 4180 +1. Disappointing options for mitigating DDoS risk + +"RFC 4180 over UTF-8" is a very simple byte stream format, and the state machine requires only a few extra states to define how to handle all UTF-8 streams that are non-RFC 4180, so it seemed odd that there wasn't a reader without these flaws. + +With Cursively, +1. each stream only strictly requires a grand total of two objects to be allocated on the managed heap*, + - *in case this is too much, both could be reset and put into a pool to be reused for processing other streams +1. processing happens directly on the input bytes (no decoding is done by Cursively itself), +1. the only processing that Cursively necessarily does is the bare minimum needed to describe the data to the caller, +1. inputs that conform to RFC 4180* are processed according to all the rules of RFC 4180, and + - *inputs that do not conform to RFC 4180 are handled according to consistent, intuitive rules +1. 
there is a very low risk* of DDoS directly from using Cursively, and the caller has the tools that they need in order to prevent (or respond to) attacks in a more "natural" way than other CSV libraries that the developer has seen. + - *There is no such thing as "risk-free" in our world. Cursively itself cannot eliminate the risk of attacks that use it as a vector to exploit defects in CoreFX / C# compiler / runtime / OS / hardware. + +Future enhancements may add support for byte streams in other encodings if there's demand for it, but not at the expense of anything that matters to the "RFC 4180 over UTF-8" use case. diff --git a/doc/overwrite/.gitkeep b/doc/overwrite/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/release-notes.md b/doc/release-notes.md new file mode 100644 index 0000000..8c1dc2d --- /dev/null +++ b/doc/release-notes.md @@ -0,0 +1,4 @@ +# Cursively Release Notes + +## 1.0.0 +- Initial release. diff --git a/doc/toc.yml b/doc/toc.yml new file mode 100644 index 0000000..aa4ba7c --- /dev/null +++ b/doc/toc.yml @@ -0,0 +1,10 @@ +- name: Home + href: index.md +- name: API Documentation + href: obj/api/ +- name: Release Notes + href: release-notes.md +- name: NuGet Package + href: https://www.nuget.org/packages/Cursively +- name: GitHub + href: https://github.com/airbreather/Cursively diff --git a/generate-docs.cmd b/generate-docs.cmd new file mode 100644 index 0000000..b4f9dff --- /dev/null +++ b/generate-docs.cmd @@ -0,0 +1,17 @@ +@echo off +REM =========================================================================== +REM Regenerates the https://airbreather.github.io/Cursively content locally +REM =========================================================================== +set DOCFX_PACKAGE_VERSION=2.42.4 +pushd %~dp0 +REM incremental / cached builds tweak things about the output, so let's do it +REM all fresh if we can help it... 
+rd /s /q src\Cursively\obj +rd /s /q doc\obj +dotnet restore +pushd tools +rd /s /q docfx.console.%DOCFX_PACKAGE_VERSION% +nuget install docfx.console -Version %DOCFX_PACKAGE_VERSION% +popd +%~dp0\tools\docfx.console.%DOCFX_PACKAGE_VERSION%\tools\docfx doc\docfx.json +popd diff --git a/push-docs.cmd b/push-docs.cmd new file mode 100644 index 0000000..12188f6 --- /dev/null +++ b/push-docs.cmd @@ -0,0 +1,20 @@ +@echo off +REM =========================================================================== +REM Pushes the https://airbreather.github.io/Cursively content to the server +REM (run generate-docs.cmd first) +REM =========================================================================== +pushd %~dp0 +pushd doc\obj +rd /s /q gh-pages +git clone --branch gh-pages %~dp0 gh-pages +pushd gh-pages +git rm -r . +xcopy /Q /E /R /Y ..\generated-site-content . +git add . +git commit -m "Update docs. This was performed automatically." +git push origin gh-pages +popd +rd /s /q gh-pages +popd +git push origin gh-pages +popd diff --git a/run-benchmark.cmd b/run-benchmark.cmd new file mode 100644 index 0000000..4b05088 --- /dev/null +++ b/run-benchmark.cmd @@ -0,0 +1,4 @@ +pushd %~dp0 +dotnet run -c Release --project test\Cursively.Benchmark\Cursively.Benchmark.csproj --framework netcoreapp2.2 +for %%D in (BenchmarkDotNet.Artifacts\results\*.html) do %%D +popd diff --git a/src/Cursively/CsvReaderVisitorBase.cs b/src/Cursively/CsvReaderVisitorBase.cs index 1b804ab..5b0eea6 100644 --- a/src/Cursively/CsvReaderVisitorBase.cs +++ b/src/Cursively/CsvReaderVisitorBase.cs @@ -38,16 +38,22 @@ public abstract class CsvReaderVisitorBase /// /// /// + /// /// Field is split across multiple read buffer chunks, or else it runs up to the very end of /// a read buffer chunk, but we can't prove it without the first byte of the next chunk or a /// call. 
+ /// /// /// + /// /// Quoted field contains a literal quote that was escaped in the original stream, and so we /// cannot yield the entire field data as-is. + /// /// /// + /// /// Stream does not conform to RFC 4180, and optimizing such streams to avoid this case. + /// /// /// /// diff --git a/src/Cursively/CsvTokenizer.cs b/src/Cursively/CsvTokenizer.cs index e5c3fe7..8dd039e 100644 --- a/src/Cursively/CsvTokenizer.cs +++ b/src/Cursively/CsvTokenizer.cs @@ -10,9 +10,9 @@ namespace Cursively /// /// The byte stream is tokenized according to the rules of the ASCII encoding. This makes it /// compatible with any encoding that encodes 0x0A, 0x0D, 0x22, and 0x2C the same way that ASCII - /// encodes them. Windows code pages and UTF-8 are notable examples of acceptable encodings. + /// encodes them. UTF-8 and Extended ASCII SBCS are notable examples of acceptable encodings. /// UTF-16 is a notable example of an unacceptable encoding; trying to use this class to process - /// text encoded in any other encoding will yield undesirable results without any errors. + /// text encoded in an unacceptable encoding will yield undesirable results without any errors. /// /// /// All bytes that appear in the stream except 0x0A, 0x0D, 0x22, and 0x2C are unconditionally @@ -38,38 +38,52 @@ namespace Cursively /// /// /// + /// /// The spec says that separate lines are delimited by CRLF line breaks. This implementation /// accepts line breaks of any format (CRLF, LF, CR). + /// /// /// + /// /// The spec says that there may or may not be a line break at the end of the last record in the /// stream. This implementation does not require there to be a line break, and it would not /// hurt to add one either. + /// /// /// + /// /// The spec refers to an optional header line at the beginning. This implementation does not /// include any special treatment for the first line of fields; if they need to be treated as /// headers, then the consumer needs to know that and respond accordingly. 
+ /// /// /// + /// /// The spec says each record may contain "one or more fields". This implementation interprets /// that to mean strictly that any number of consecutive newline characters in a row are treated /// as one. + /// /// /// + /// /// Many implementations allow the delimiter character to be configured to be something else /// other than a comma. This implementation does not currently offer that flexibility. + /// /// /// + /// /// Many implementations allow automatically trimming whitespace at the beginning and/or end of /// each field (sometimes optionally). The spec expressly advises against doing that, and this /// implementation follows suit. It is our opinion that consumers ought to be more than capable /// of trimming spaces at the beginning or end as part of their processing if this is desired. + /// /// /// + /// /// The spec says that the last field in a record must not be followed by a comma. This /// implementation interprets that to mean that if we do see a comma followed immediately by a /// line ending character, then that represents the data for an empty field. + /// /// /// /// @@ -146,10 +160,10 @@ public class CsvTokenizer private enum ParserFlags : byte { None, - ReadAnythingOnCurrentLine = 0b00000001, - ReadAnythingInCurrentField = 0b00000010, - CurrentFieldStartedWithQuote = 0b00000100, - QuotedFieldDataEnded = 0b00001000, + ReadAnythingOnCurrentLine = 0b00000001, + ReadAnythingInCurrentField = 0b00000010, + CurrentFieldStartedWithQuote = 0b00000100, + QuotedFieldDataEnded = 0b00001000, CutAtPotentiallyTerminalDoubleQuote = 0b00010000, } @@ -185,7 +199,7 @@ public void ProcessNextChunk(ReadOnlySpan chunk, CsvReaderVisitorBase visi { // most of the time, we should be able to fully process each field in the same // loop iteration that we first start reading it. the most prominent exception - // is that + // is when we encounter a quoted field. 
PickUpFromLastTime(ref chunk, visitor); continue; } @@ -369,8 +383,8 @@ private void HandleBufferCutAtPotentiallyTerminalDoubleQuote(ref ReadOnlySpannetstandard2.0 + + Cursively + Cursively - Fast CSV Processing + A .NET library for RFC4180-compliant CSV processing that's still fast and fault-tolerant. + airbreather + airbreather + MIT + csv;comma;separated;value + + diff --git a/src/Directory.Build.props b/src/Directory.Build.props index eec04f4..a83c324 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -4,7 +4,26 @@ - true + true + + airbreather + Copyright © 2019 - $([System.DateTime]::UtcNow.Year) airbreather + + true + true + true + snupkg + + + false + + + false + + + + + diff --git a/src/Cursively.Benchmark/Cursively.Benchmark.csproj b/test/Cursively.Benchmark/Cursively.Benchmark.csproj similarity index 93% rename from src/Cursively.Benchmark/Cursively.Benchmark.csproj rename to test/Cursively.Benchmark/Cursively.Benchmark.csproj index 122f96c..4620fe7 100644 --- a/src/Cursively.Benchmark/Cursively.Benchmark.csproj +++ b/test/Cursively.Benchmark/Cursively.Benchmark.csproj @@ -4,8 +4,6 @@ Exe netcoreapp2.2;net472 true - - false diff --git a/src/Cursively.Benchmark/Program.cs b/test/Cursively.Benchmark/Program.cs similarity index 88% rename from src/Cursively.Benchmark/Program.cs rename to test/Cursively.Benchmark/Program.cs index 7a981ff..38d3d9c 100644 --- a/src/Cursively.Benchmark/Program.cs +++ b/test/Cursively.Benchmark/Program.cs @@ -58,7 +58,7 @@ public long CountRowsUsingCsvHelper(CsvFile csvFile) } } - static int Main() + private static int Main() { var prog = new Program(); foreach (var csvFile in CsvFiles) @@ -88,11 +88,9 @@ public CsvFile(string fullPath) => public override string ToString() => FileName; } - static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) - { - return Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"), - fullPath => new 
CsvFile(fullPath)); - } + private static CsvFile[] GetCsvFiles([CallerFilePath]string myLocation = null) => + Array.ConvertAll(Directory.GetFiles(Path.Combine(Path.GetDirectoryName(myLocation), "large-csv-files"), "*.csv"), + fullPath => new CsvFile(fullPath)); private sealed class RowCountingVisitor : CsvReaderVisitorBase { @@ -100,7 +98,7 @@ private sealed class RowCountingVisitor : CsvReaderVisitorBase public long RowCount { get; private set; } - public override void VisitEndOfRecord() => ++this.RowCount; + public override void VisitEndOfRecord() => ++RowCount; public override void VisitEndOfField(ReadOnlySpan chunk) { } diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv new file mode 100644 index 0000000..718947c --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/100-huge-records-quoted.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:611a7ba4f69bf3ab34f1fbf3fbf4711bfa8fb91a210683bdf4c1915818f1cfe0 +size 4900444 diff --git a/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv new file mode 100644 index 0000000..fde3ed5 --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/100-huge-records.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e82c977d84c24a6b16063b634cbeab1e8409b34724b0ecf07893f45f8aadb53 +size 2900444 diff --git a/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv new file mode 100644 index 0000000..61dd063 --- /dev/null +++ b/test/Cursively.Benchmark/large-csv-files/10k-empty-records.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f1e211bf4eb14ab578ccf6aff141e8db41e80314b39b85fba5f047830f746e4 +size 10020000 diff --git a/src/Cursively.Benchmark/large-csv-files/mocked.csv 
b/test/Cursively.Benchmark/large-csv-files/mocked.csv similarity index 100% rename from src/Cursively.Benchmark/large-csv-files/mocked.csv rename to test/Cursively.Benchmark/large-csv-files/mocked.csv diff --git a/test/Cursively.Tests/CsvTokenizerTests.cs b/test/Cursively.Tests/CsvTokenizerTests.cs index f1ab8e9..0c18b40 100644 --- a/test/Cursively.Tests/CsvTokenizerTests.cs +++ b/test/Cursively.Tests/CsvTokenizerTests.cs @@ -94,7 +94,7 @@ private static IEnumerable TokenizeCsvFileUsingCsvHelper(byte[] csvDat private static byte[][] VaryLineEndings(ReadOnlySpan fileData, int randomSeed) { - var resultLists = new List[] + List[] resultLists = { new List(), new List(), @@ -104,7 +104,7 @@ private static byte[][] VaryLineEndings(ReadOnlySpan fileData, int randomS new List(), }; - var lineEndings = new byte[][] + byte[][] lineEndings = { new byte[] { (byte)'\r' }, new byte[] { (byte)'\n' }, diff --git a/test/Directory.Build.props b/test/Directory.Build.props index cbf08bc..9399748 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -3,4 +3,8 @@ + + False + + diff --git a/tools/.gitignore b/tools/.gitignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/tools/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file