Skip to content

Commit e7e2ac8

Browse files
authored
Merge 40611ee into 97cc023
2 parents 97cc023 + 40611ee commit e7e2ac8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1556
-450
lines changed

.github/workflows/build-test-publish.yaml

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,12 @@ jobs:
3737

3838
- name: Test Report
3939
uses: bibipkins/[email protected]
40+
if: github.repository == 'mukunku/ParquetViewer'
4041
with:
4142
github-token: ${{ secrets.GITHUB_TOKEN }}
4243
comment-title: 'Unit Test Results'
4344
results-path: ./src/ParquetViewer.Tests/TestResults/*.trx
44-
45+
4546
checkPublish:
4647
runs-on: windows-latest
4748
needs: test
@@ -70,7 +71,7 @@ jobs:
7071
id: check-release
7172
with:
7273
tag: 'v${{ steps.release-version.outputs.release_version }}'
73-
74+
7475
- id: should-publish
7576
run: |
7677
"should_publish=" + (
@@ -97,8 +98,7 @@ jobs:
9798
with:
9899
dotnet-version: '8.0.x'
99100

100-
# Inject Amplitude API Key
101-
- name: Replace single file
101+
- name: Inject Amplitude API Key
102102
uses: richardrigutins/replace-in-files@v2
103103
with:
104104
files: 'src/ParquetViewer/Analytics/AmplitudeEvent.cs'
@@ -107,32 +107,60 @@ jobs:
107107

108108
- name: Build & Publish Regular Release
109109
run: dotnet publish src/ParquetViewer/ParquetViewer.csproj -c Release -f net8.0-windows --nologo -o publish -r win-x64 --self-contained false
110-
110+
111111
- name: Build & Publish SelfContained Release
112112
run: dotnet publish src/ParquetViewer/ParquetViewer.csproj -c Release_SelfContained -f net8.0-windows --nologo -o publish_selfcontained -r win-x64 --self-contained true
113113

114+
- name: Prepare executables for upload
115+
run: |
116+
Move-Item -Path "publish/ParquetViewer.exe" -Destination "./ParquetViewer.exe"
117+
Move-Item -Path "publish_selfcontained/ParquetViewer.exe" -Destination "./ParquetViewer_SelfContained.exe"
118+
119+
- name: Upload unsigned artifact for signing
120+
id: upload-unsigned-artifact
121+
uses: actions/upload-artifact@v4
122+
with:
123+
path: |
124+
ParquetViewer.exe
125+
ParquetViewer_SelfContained.exe
126+
127+
- name: Remove unsigned exe's for safety
128+
run: |
129+
Remove-Item -Path "ParquetViewer.exe"
130+
Remove-Item -Path "ParquetViewer_SelfContained.exe"
131+
132+
# Documentation: https://about.signpath.io/documentation/trusted-build-systems/github
133+
- name: Submit signing request to SignPath.io
134+
uses: signpath/[email protected]
135+
with:
136+
api-token: '${{ secrets.SIGNPATH_API_TOKEN }}'
137+
organization-id: '5ceccea7-c3e7-4165-8c2e-adab8679db20'
138+
project-slug: 'ParquetViewer'
139+
signing-policy-slug: 'release-signing'
140+
github-artifact-id: '${{ steps.upload-unsigned-artifact.outputs.artifact-id }}'
141+
wait-for-completion: true
142+
output-artifact-directory: '/signed-package'
143+
114144
- name: Generate _checksums.txt
115145
run: |
116-
Move-Item -Path "publish/ParquetViewer.exe" -Destination "ParquetViewer.exe"
117-
Move-Item -Path "publish_selfcontained/ParquetViewer.exe" -Destination "ParquetViewer_SelfContained.exe"
118-
$fileHash = (Get-FileHash ParquetViewer.exe -Algorithm SHA256)
119-
$fileHashSelfContained = (Get-FileHash ParquetViewer_SelfContained.exe -Algorithm SHA256)
120-
$fileSize = (Get-Item -Path "ParquetViewer.exe").Length
121-
$fileSizeSelfContained = (Get-Item -Path "ParquetViewer_SelfContained.exe").Length
146+
$fileHash = (Get-FileHash "signed-package/ParquetViewer.exe" -Algorithm SHA256)
147+
$fileHashSelfContained = (Get-FileHash "signed-package/ParquetViewer_SelfContained.exe" -Algorithm SHA256)
148+
$fileSize = (Get-Item -Path "signed-package/ParquetViewer.exe").Length
149+
$fileSizeSelfContained = (Get-Item -Path "signed-package/ParquetViewer_SelfContained.exe").Length
122150
"v${{ env.VERSION_NUMBER }}" >> "_checksums.txt"
123151
"" >> "_checksums.txt"
124152
"Name: ParquetViewer.exe" >> "_checksums.txt"
125153
"Size: $fileSize bytes ($([math]::floor($fileSize/1KB)) KiB)" >> "_checksums.txt"
126154
"SHA256: $($fileHash.Hash)" >> "_checksums.txt"
127155
"" >> "_checksums.txt"
128156
"Name: ParquetViewer_SelfContained.exe" >> "_checksums.txt"
129-
"Size: $fileSizeSelfContained bytes ($([math]::floor($fileSizeSelfContained/1KB)) KiB)" >> "_checksums.txt"
157+
"Size: $fileSizeSelfContained bytes ($([math]::floor($fileSizeSelfContained/1MB)) MiB)" >> "_checksums.txt"
130158
"SHA256: $($fileHashSelfContained.Hash)" >> "_checksums.txt"
131159
"" >> "_checksums.txt"
132160
133161
- uses: ncipollo/release-action@v1
134162
with:
135-
artifacts: "ParquetViewer.exe,ParquetViewer_SelfContained.exe,_checksums.txt"
163+
artifacts: "signed-package/ParquetViewer.exe,signed-package/ParquetViewer_SelfContained.exe,_checksums.txt"
136164
body: "PR: #${{ env.PR_NUMBER }}"
137165
allowUpdates: ${{ env.BRANCH_NAME != 'main' }}
138166
omitBodyDuringUpdate: true
@@ -141,3 +169,4 @@ jobs:
141169
replacesArtifacts: true
142170
updateOnlyUnreleased: true
143171
tag: v${{ env.VERSION_NUMBER }}
172+
commit: ${{ github.sha }}

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ Checkout the [ParquetViewer Analytics Dashboard](https://app.amplitude.com/analy
2626
[^1]: Full privacy policy here: https://github.com/mukunku/ParquetViewer/wiki/Privacy-Policy
2727

2828
# Technical Details
29-
The latest version of this project was written in C# using Microsoft Visual Studio Community 2022 v17.12.3 and .NET 8
29+
The latest version of this project was written in C# using Microsoft Visual Studio Community 2022 v17.13.1 and .NET 8
3030

3131
# Acknowledgements
32-
This utility would not be possible without: https://github.com/aloneguid/parquet-dotnet
32+
* This utility would not be possible without: https://github.com/aloneguid/parquet-dotnet
33+
* Special thanks to [SignPath.io](https://about.signpath.io/) for sponsoring this project. See our [Code signing policy](https://github.com/mukunku/ParquetViewer/wiki/Code-signing-policy) for details.
34+

src/Directory.Packages.props

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
44
</PropertyGroup>
55
<ItemGroup>
6-
<PackageVersion Include="Apache.Arrow" Version="19.0.0" />
7-
<PackageVersion Include="Parquet.Net" Version="5.1.1-pre.2" />
6+
<PackageVersion Include="Apache.Arrow" Version="20.0.0" />
7+
<PackageVersion Include="Parquet.Net" Version="5.1.1" />
88
<PackageVersion Include="Microsoft.CSharp" Version="4.7.0" />
9-
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
9+
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.14.0" />
1010
<PackageVersion Include="RichardSzalay.MockHttp" Version="7.0.0" />
11-
<PackageVersion Include="xunit" Version="2.9.2" />
12-
<PackageVersion Include="xunit.runner.visualstudio" Version="3.0.0" />
11+
<PackageVersion Include="xunit" Version="2.9.3" />
12+
<PackageVersion Include="xunit.runner.visualstudio" Version="3.1.0" />
1313
<PackageVersion Include="coverlet.collector" Version="6.0.2" />
1414
<PackageVersion Include="System.Data.DataSetExtensions" Version="4.5.0" />
1515
</ItemGroup>

src/ParquetViewer.Engine/ParquetEngine.Processor.cs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,11 @@ private static async Task ReadMapField(DataTableLite dataTable, ParquetRowGroupR
344344
var keyDataColumn = await groupReader.ReadColumnAsync(keyField.DataField!, cancellationToken);
345345
var valueDataColumn = await groupReader.ReadColumnAsync(valueField.DataField!, cancellationToken);
346346

347-
var dataEnumerable = Helpers.PairEnumerables(keyDataColumn.Data.Cast<object?>(), valueDataColumn.Data.Cast<object?>(), DBNull.Value);
347+
var dataEnumerable = Helpers.PairEnumerables(
348+
keyDataColumn.Data.Cast<object?>().Select(key => key ?? DBNull.Value),
349+
valueDataColumn.Data.Cast<object?>().Select(value => value ?? DBNull.Value),
350+
DBNull.Value);
351+
348352
//Some parquet writers don't write null entries into the data array for empty and null maps.
349353
//This throws off our logic below, so let's find all empty/null maps and add a null entry into
350354
//the data array to align it with the repetition levels.
@@ -478,7 +482,18 @@ private async Task ReadStructField(DataTableLite dataTable, ParquetRowGroupReade
478482
dataTable.NewRow();
479483
}
480484

481-
dataTable.Rows[rowIndex]![fieldIndex] = new StructValue(field.Path, finalResultDataTable.Rows[i]);
485+
//Not sure how to detect if a Struct is NULL vs. all its fields being NULL.
486+
//For now, let's consider that if all fields are NULL, the row is supposed to be NULL.
487+
bool isNull = !finalResultDataTable.Rows[i].ItemArray.Any(item => item != DBNull.Value);
488+
489+
if (isNull)
490+
{
491+
dataTable.Rows[rowIndex]![fieldIndex] = DBNull.Value;
492+
}
493+
else
494+
{
495+
dataTable.Rows[rowIndex]![fieldIndex] = new StructValue(field.Path, finalResultDataTable.Rows[i]);
496+
}
482497
rowIndex++;
483498
}
484499
}

src/ParquetViewer.Engine/ParquetEngine.cs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ namespace ParquetViewer.Engine
77
{
88
public partial class ParquetEngine : IDisposable
99
{
10-
private readonly List<ParquetReader> _parquetFiles;
10+
private readonly ParquetReader[] _parquetFiles;
1111
private long? _recordCount;
1212

1313
public long RecordCount => _recordCount ??= _parquetFiles.Sum(pf => pf.Metadata?.NumRows ?? 0);
1414

15-
public int NumberOfPartitions => _parquetFiles.Count;
15+
public int NumberOfPartitions => _parquetFiles.Length;
1616

1717
private ParquetReader DefaultReader => _parquetFiles.FirstOrDefault() ?? throw new ParquetEngineException("No parquet readers available");
1818

@@ -29,6 +29,12 @@ public partial class ParquetEngine : IDisposable
2929

3030
public string OpenFileOrFolderPath { get; }
3131

32+
private ParquetEngine(string fileOrFolderPath, params ParquetReader[] parquetFiles)
33+
{
34+
_parquetFiles = parquetFiles ?? throw new ArgumentNullException(nameof(parquetFiles), "No parquet readers provided");
35+
OpenFileOrFolderPath = fileOrFolderPath;
36+
}
37+
3238
private ParquetSchemaElement BuildParquetSchemaTree()
3339
{
3440
var thriftSchema = ThriftMetadata.Schema ?? throw new ParquetException("No thrift metadata was found");
@@ -62,12 +68,6 @@ private static ParquetSchemaElement ReadSchemaTree(ref List<SchemaElement>.Enume
6268
return parquetSchemaElement;
6369
}
6470

65-
private ParquetEngine(string fileOrFolderPath, List<ParquetReader> parquetFiles)
66-
{
67-
_parquetFiles = parquetFiles ?? throw new ParquetEngineException("No parquet readers found");
68-
OpenFileOrFolderPath = fileOrFolderPath;
69-
}
70-
7171
public static Task<ParquetEngine> OpenFileOrFolderAsync(string fileOrFolderPath, CancellationToken cancellationToken)
7272
{
7373
if (File.Exists(fileOrFolderPath)) //Handles null
@@ -94,7 +94,7 @@ public static async Task<ParquetEngine> OpenFileAsync(string parquetFilePath, Ca
9494
try
9595
{
9696
var parquetReader = await ParquetReader.CreateAsync(parquetFilePath, null, cancellationToken);
97-
return new ParquetEngine(parquetFilePath, new List<ParquetReader> { parquetReader });
97+
return new ParquetEngine(parquetFilePath, parquetReader);
9898
}
9999
catch (Exception ex)
100100
{
@@ -161,7 +161,7 @@ public static async Task<ParquetEngine> OpenFolderAsync(string folderPath, Cance
161161

162162
cancellationToken.ThrowIfCancellationRequested();
163163

164-
return new ParquetEngine(folderPath, fileGroups.Values.First());
164+
return new ParquetEngine(folderPath, fileGroups.Values.First().ToArray());
165165
}
166166

167167
private IEnumerable<(long RemainingOffset, ParquetReader ParquetReader)> GetReaders(long offset)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
using ParquetViewer.Engine.Types;
2+
3+
namespace ParquetViewer.Engine
4+
{
5+
//Global settings, what can go wrong? It's convenient, though.
//NOTE: this is static, mutable state with a public setter and no synchronization in this class.
6+
public static class ParquetEngineSettings
7+
{
8+
/// <summary>
9+
/// By default Parquet Engine will render Dates using the system culture's format.
10+
/// By setting this value a custom date format can be used instead.
/// Leave it as <c>null</c> to keep the default rendering.
11+
/// </summary>
12+
/// <remarks>Parquet Engine renders dates when converting <see cref="ListValue"/>,
13+
/// <see cref="StructValue"/>, and <see cref="MapValue"/> types to string.</remarks>
14+
public static string? DateDisplayFormat { get; set; }
15+
}
16+
}

src/ParquetViewer.Engine/Types/ByteArrayValue.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,23 @@ public ByteArrayValue(string name, byte[] data)
1313

1414
public override string ToString() => BitConverter.ToString(this.Data);
1515

16+
/// <summary>
/// Renders the byte array in the same dash-separated hex format as <see cref="ToString"/>,
/// shortened to roughly <paramref name="desiredLength"/> characters by keeping the leading and
/// trailing bytes and replacing everything in between with "[...]".
/// </summary>
/// <param name="desiredLength">Approximate target length, in characters, of the returned string.
/// Values too small to fit a byte on each side (including zero and negatives) still keep one
/// byte on each side.</param>
/// <returns>The untruncated string when the data already fits; otherwise the head bytes,
/// followed by "[...]", followed by the tail bytes.</returns>
public string ToStringTruncated(int desiredLength)
{
    var bytesNeededToGetLength = StringLengthToByteCount(desiredLength);

    //'<=' (not '<'): data that fits exactly must not be cut, otherwise we'd insert a marker
    //into a string that was already short enough (and drop the middle byte for odd counts).
    if (this.Data.Length <= bytesNeededToGetLength)
    {
        return ToString();
    }

    //Always keep at least one byte per side. With zero bytes per side the tail's start index
    //would equal Data.Length, which BitConverter.ToString rejects with an ArgumentOutOfRangeException.
    var bytesPerSide = Math.Max(1, bytesNeededToGetLength / 2);
    if (this.Data.Length <= bytesPerSide * 2)
    {
        //Head and tail would cover (or overlap) the whole array, so there is nothing to elide.
        return ToString();
    }

    //We're going to return a bit more than desiredLength here but we can live with that
    return BitConverter.ToString(this.Data, 0, bytesPerSide)
        + "[...]" + BitConverter.ToString(this.Data, this.Data.Length - bytesPerSide);
}

//Calculates how many whole bytes fit into a string of the given length.
//One byte = 3 chars. E.g. AA- (the +1 accounts for the last byte which won't have a dash)
private static int StringLengthToByteCount(int stringLength)
    => (stringLength + 1) / 3;
32+
1633
public int CompareTo(ByteArrayValue? other)
1734
{
1835
if (other?.Data is null)

src/ParquetViewer.Engine/Types/ListValue.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ public class ListValue : IComparable<ListValue>, IComparable, IEnumerable<object
77
{
88
public IList Data { get; }
99
public Type? Type { get; private set; }
10-
public static string? DateDisplayFormat { get; set; }
1110

1211
public ListValue(Array data)
1312
{
@@ -44,8 +43,8 @@ public override string ToString()
4443
if (!isFirst)
4544
sb.Append(',');
4645

47-
if (data is DateTime dt && DateDisplayFormat is not null)
48-
sb.Append(dt.ToString(DateDisplayFormat));
46+
if (data is DateTime dt && ParquetEngineSettings.DateDisplayFormat is not null)
47+
sb.Append(dt.ToString(ParquetEngineSettings.DateDisplayFormat));
4948
else
5049
sb.Append(data?.ToString() ?? string.Empty);
5150

src/ParquetViewer.Engine/Types/MapValue.cs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
using System;
2-
using System.Collections;
3-
using System.Collections.Generic;
1+
using System.Collections;
42
using System.Text;
53

64
namespace ParquetViewer.Engine.Types
@@ -11,7 +9,6 @@ public class MapValue : IComparable<MapValue>, IComparable, IEnumerable<(object
119
public Type KeyType { get; }
1210
public ArrayList Values { get; }
1311
public Type ValueType { get; }
14-
public static string? DateDisplayFormat { get; set; }
1512

1613
public MapValue(ArrayList keys, Type keyType, ArrayList values, Type valueType)
1714
{
@@ -25,11 +22,11 @@ public MapValue(ArrayList keys, Type keyType, ArrayList values, Type valueType)
2522
Keys = keys;
2623
Values = values;
2724

28-
var mismatchedType = keys.Cast<object?>().Where(key => key != DBNull.Value && key != null).FirstOrDefault(key => key!.GetType() != keyType);
25+
var mismatchedType = keys.Cast<object?>().Where(key => key != DBNull.Value).FirstOrDefault(key => key!.GetType() != keyType);
2926
if (mismatchedType != null)
3027
throw new ArgumentException($"The key's type {mismatchedType} doesn't match the passed key-type {keyType}");
3128

32-
mismatchedType = values.Cast<object?>().Where(value => value != DBNull.Value && value != null).FirstOrDefault(value => value!.GetType() != valueType);
29+
mismatchedType = values.Cast<object?>().Where(value => value != DBNull.Value).FirstOrDefault(value => value!.GetType() != valueType);
3330
if (mismatchedType != null)
3431
throw new ArgumentException($"The value's type {mismatchedType} doesn't match the passed value-type {valueType}");
3532

@@ -60,14 +57,14 @@ public override string ToString()
6057
static string FormatString((object Key, object Value) map)
6158
{
6259
string key;
63-
if (map.Key is DateTime dt && DateDisplayFormat is not null)
64-
key = dt.ToString(DateDisplayFormat);
60+
if (map.Key is DateTime dt && ParquetEngineSettings.DateDisplayFormat is not null)
61+
key = dt.ToString(ParquetEngineSettings.DateDisplayFormat);
6562
else
6663
key = map.Key?.ToString() ?? string.Empty;
6764

6865
string value;
69-
if (map.Value is DateTime dt2 && DateDisplayFormat is not null)
70-
value = dt2.ToString(DateDisplayFormat);
66+
if (map.Value is DateTime dt2 && ParquetEngineSettings.DateDisplayFormat is not null)
67+
value = dt2.ToString(ParquetEngineSettings.DateDisplayFormat);
7168
else
7269
value = map.Value?.ToString() ?? string.Empty;
7370

0 commit comments

Comments
 (0)